import os

import pandas
import yfinance as yf
from get_all_tickers.get_tickers import get_tickers
from tensorflow.keras.callbacks import History
from pandas import DataFrame

# Resolve data/weight paths relative to this file so the script works from any CWD.
PROJ_PATH = os.path.dirname(__file__)
ticker_data_file = os.path.join(PROJ_PATH, "ticker_data_NYSE.hdf")
model_weights_file = os.path.join(PROJ_PATH, "model_weights.bin")

def main():
    # NASDAQ and AMEX disabled, so this returns NYSE tickers only.
    all_tickers = get_tickers(NASDAQ=False, AMEX=False)
    print(all_tickers)

    data = yf.download(tickers="SPL AAPL", start="2017-01-01", end="2017-04-30", interval="60m")
    # data = yf.download(tickers="SPL AAPL", start="2017-01-01", end="2017-04-30", interval="1d")
    data = yf.download(tickers=" ".join(all_tickers), interval="1d")

    print(data)
    print(type(data))

    # data.to_csv(ticker_data_file)

def test():
    nyse_tickers = get_tickers(NASDAQ=False, AMEX=False)
    print(nyse_tickers)
    # data = yf.download(tickers="SPL AAPL", start="2017-01-01", end="2017-04-30", interval="60m")
    data = yf.download(tickers="SPL AAPL", start="2017-01-01", end="2017-04-30", interval="1d")
    print(data)
    print(data.keys())
    data.to_hdf(ticker_data_file, key='Date')
    exit()

def download_test_data():
    if os.path.exists(ticker_data_file):
        print("file already exists, won't download")
        return
    nyse_tickers = get_tickers(NASDAQ=False, AMEX=False)
    data = yf.download(tickers=" ".join(nyse_tickers), interval="1d")
    print("storing data ...")
    data.to_hdf(ticker_data_file, key="Date")
    print("storing data Done")

def transform_data() -> DataFrame:
    print("reading data ...")
    data = pandas.read_hdf(ticker_data_file)
    print("reading done ...")

    return data

def get_day_x_delta_data(df: DataFrame, day_delta: int = 1, absolute=False) -> DataFrame:
    """Per-column change over day_delta rows, absolute or in percent.

    Note: the percent variant divides the diff by the *current* value, not the
    previous one, so it is not identical to pandas' pct_change().
    """
    if absolute:
        return df.diff(day_delta).add_suffix('_{}d'.format(day_delta))
    return (df.diff(day_delta) * 100 / df).add_suffix('_{}d%'.format(day_delta))

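
# A minimal, hedged illustration of get_day_x_delta_data on a hand-made frame.
# It is defined but never called; the tickers 'AAA'/'BBB' and their values are
# made up purely to show the suffix naming and the percent convention.
def _example_day_delta():
    toy = DataFrame({'AAA': [100.0, 110.0, 99.0], 'BBB': [50.0, 50.0, 55.0]})
    print(get_day_x_delta_data(toy, 1))                 # columns AAA_1d%, BBB_1d% (diff / current value * 100)
    print(get_day_x_delta_data(toy, 1, absolute=True))  # columns AAA_1d, BBB_1d (plain one-row diffs)
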
def create_model():
    # NOTE: relies on x_train (and Sequential/Dense) being module-level globals;
    # both are set up in the __main__ block below before this is first called.
    model = Sequential()
    # input_dim is the width of the feature table built in __main__
    model.add(Dense(350, input_dim=x_train.shape[1], activation="relu"))
    model.add(Dense(350, activation="relu"))
    model.add(Dense(350, activation="relu"))
    model.add(Dense(350, activation="relu"))
    model.add(Dense(350, activation="relu"))
    model.add(Dense(1, activation="linear"))
    # Configure the model
    model.compile(optimizer='adam', loss="mean_absolute_error",
                  metrics=["mean_absolute_error"])
    return model

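
# Hedged design note: a variant that takes the input width as a parameter instead
# of reading the x_train global would make the model reusable outside this script.
# Sketch only, never called; `input_width` is an assumed name, the layer sizes
# mirror create_model(), and Sequential/Dense must already be imported.
def _create_model_explicit(input_width: int):
    model = Sequential()
    model.add(Dense(350, input_dim=input_width, activation="relu"))
    for _ in range(4):
        model.add(Dense(350, activation="relu"))
    model.add(Dense(1, activation="linear"))
    model.compile(optimizer='adam', loss="mean_absolute_error", metrics=["mean_absolute_error"])
    return model
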
def train_model(model=None):
    # Train the model; like create_model(), this relies on the train/val/test
    # globals (x_train, y_train, ...) and on plt being set up in __main__.
    if model is None:
        model = create_model()
    history = History()

    model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=15, batch_size=64, callbacks=[history])

    # Use the model's evaluate method to predict and evaluate the test dataset
    result = model.evaluate(x_test.values, y_test.values)
    # Print the results
    for i in range(len(model.metrics_names)):
        print("Metric ", model.metrics_names[i], ":", str(round(result[i], 2)))

    model.save_weights(model_weights_file)

    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title("Model's Training & Validation loss across epochs")
    plt.ylabel('Loss')
    plt.xlabel('Epochs')
    plt.legend(['Train', 'Validation'], loc='upper right')
    plt.show()

    return model

if __name__ == '__main__':
    # main()
    try:
        data = transform_data()
    except FileNotFoundError:
        download_test_data()
        data = transform_data()

print("keys:")
|
|
print(data.keys())
|
|
print("index:")
|
|
print(data.index)
|
|
|
|
data = data.drop('Volume', axis=1, level=0) # drop Volume data
|
|
data = data.drop('Open', axis=1, level=0) # drop Open data
|
|
data = data.drop('Low', axis=1, level=0) # drop Low data
|
|
data = data.drop('High', axis=1, level=0) # drop High data
|
|
data = data.drop('Close', axis=1, level=0) # drop Close data
|
|
|
|
# data.columns = [' '.join(col).strip() for col in data.columns.values]
|
|
# only keep level 1 (of 0 and 1) columns:
|
|
data.columns = data.columns.get_level_values(1)
|
|
|
|
print(data)
|
|
print(data.keys())
|
|
print(data.iloc[0])
|
|
|
|
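    # A minimal sketch (never called) of the column flattening above, on a tiny
    # hypothetical two-level frame; 'AAA'/'BBB' are made-up tickers.
    def _example_flatten_columns():
        cols = pandas.MultiIndex.from_product([['Adj Close'], ['AAA', 'BBB']])
        toy = DataFrame([[1.0, 2.0]], columns=cols)
        toy.columns = toy.columns.get_level_values(1)
        print(toy.columns.tolist())  # ['AAA', 'BBB']
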
    # print(get_day_x_delta_data(data, 1, True))
    SYM = 'ZTS'
    # drop the days on which the target symbol itself has no price
    data.dropna(subset=[SYM], inplace=True)
    d1 = get_day_x_delta_data(data, 1)  # 1-day relative change per ticker
    d7 = get_day_x_delta_data(data, 7)  # 7-day relative change per ticker
    # print(d1.corr())

    print(d1.head())

    # add the 7-day column to the 1-day feature table; note this is the same
    # column that is used as the prediction target below, so the features
    # currently leak the target into the model's input
    d1[f'{SYM}_7d%'] = d7[f'{SYM}_7d%']

    d1.dropna(subset=[f'{SYM}_1d%', f'{SYM}_7d%'], inplace=True)
    d7.dropna(subset=[f'{SYM}_7d%'], inplace=True)

    # remaining NaNs (tickers without a quote on a given day) become 0 change
    d1.fillna(0, inplace=True)
    d7.fillna(0, inplace=True)
    print(d1.head(100))

    print(d1.shape)

    # Imported here, before any plotting and before train_model() runs
    # (train_model() uses plt as a module-level global).
    import matplotlib.pyplot as plt
    import seaborn as sns  # Seaborn: statistical visualization on top of matplotlib

    # sns.lineplot(data=data, x='Date', y=f'{SYM}')
    # sns.lineplot(data=d1, x='Date', y=f'{SYM}_1d%')
    # plt.show()

    # Compound the 1-day relative changes into an overall performance factor.
    # Because get_day_x_delta_data() divides the diff by the *current* value,
    # the exact per-day growth factor is 1 / (1 - v/100), not (1 + v/100).
    performance = 1
    for v in d1[f'{SYM}_1d%']:
        performance /= (1 - (v / 100))
    print(performance)

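    # Hedged sanity check (sketch, never called): with the compounding above,
    # the product of daily factors should roughly reproduce last price / first
    # price for the target symbol, up to the rows altered by dropna/fillna.
    def _check_compounding():
        prices = data[SYM].dropna()
        print("price ratio:", prices.iloc[-1] / prices.iloc[0])
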
print("Distinct Datatypes:", data.dtypes.unique())
|
|
|
|
    target = [f'{SYM}_7d%']
    from sklearn.model_selection import train_test_split

    # Create train and test datasets with an 80:20 split
    x_train, x_test, y_train, y_test = train_test_split(d1, d7[target], test_size=0.2, random_state=2018)
    # Further divide the training dataset into train and validation sets with a 90:10 split
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=2018)

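    # Hedged aside (sketch only, never called): train_test_split shuffles rows
    # by default, so for time-ordered returns the test set contains days that
    # precede some training days. A chronological split via sklearn's
    # TimeSeriesSplit would avoid that form of leakage.
    def _chronological_split_sketch():
        from sklearn.model_selection import TimeSeriesSplit
        for train_idx, test_idx in TimeSeriesSplit(n_splits=5).split(d1):
            print("train rows:", len(train_idx), "test rows:", len(test_idx))
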
print("Shape of x_train:", x_train.shape)
|
|
print("Shape of x_val:", x_val.shape)
|
|
print("Shape of x_test:", x_test.shape)
|
|
print("Shape of y_train:", y_train.shape)
|
|
print("Shape of y_val:", y_val.shape)
|
|
print("Shape of y_test:", y_test.shape)
|
|
|
|
print(y_test)
|
|
|
|
    # Baseline: predict the mean of the training targets for every test row
    mean_target = y_train.mean()
    print("Average target:", mean_target)
    # Mean Absolute Error of that constant baseline on the test dataset
    print("MAE for Test Data:", abs(y_test - mean_target).mean())

    # PlaidML backend setup. NOTE: KERAS_BACKEND only takes effect if it is set
    # before Keras is first imported, and tensorflow.keras was already imported
    # at the top of this file, so only the standalone `keras` package below is
    # actually switched to PlaidML here.
    os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
    os.environ["PLAIDML_EXPERIMENTAL"] = "1"
    os.environ["PLAIDML_DEVICE_IDS"] = "opencl_amd_gfx1010.0"
    import plaidml.keras

    plaidml.keras.install_backend()
    import keras

    # Create the deep neural network architecture
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout

    new_model = False
    if new_model or (not os.path.exists(model_weights_file)):
        print("creating new model")
        model = train_model()
    else:
        print("loading weights into existing model architecture")
        model = create_model()
        model.load_weights(model_weights_file)

    model.summary()

    # Everything below is exploratory scratch work and never runs.
    exit()

    print(data.isnull().sum() / data.shape[0] * 100)  # percent of missing values per column

    first_loc = data.index.get_loc(data.index[0])

    print(type(data))
    # print(data.diff())
    for day_diff in range(1, 8):  # one- to seven-day deltas
        d_day = data.diff(day_diff) / data
        d_day = d_day.add_suffix('-{}%'.format(day_diff))
        print(d_day)
    print(data.diff(2) / data)

    print(pandas.concat([data, d_day]))
    # print(data["-1"])