first network, but probably not working very well

2020-09-18 20:40:15 +02:00
parent 1a7c5c9dd1
commit 09f577483a
3 changed files with 749 additions and 32 deletions
--- a/main.py
+++ b/main.py
@@ -3,11 +3,14 @@ import os
 import pandas
 import yfinance as yf
 from get_all_tickers.get_tickers import get_tickers
+from tensorflow.keras.callbacks import History
 from pandas import DataFrame

 PROJ_PATH = os.path.dirname(__file__)
 ticker_data_file = os.path.join(PROJ_PATH, "ticker_data_NYSE.hdf")

+model_weights_file = os.path.join(PROJ_PATH, "model_weights.bin")
+

 def main():
    all_tickers = get_tickers(NASDAQ=False, AMEX=False)
@@ -60,6 +63,49 @@ def get_day_x_delta_data(df: DataFrame, day_delta: int = 1, absolute=False) -> D
    return (df.diff(day_delta) * 100 / df).add_suffix('_{}d%'.format(day_delta))


+def create_model(input_dim=None):
+    """Build and compile the regression MLP: five 350-unit ReLU layers and one linear output.
+
+    `input_dim` is the feature count; when None it is read from the global x_train.
+    """
+    if input_dim is None:
+        input_dim = x_train.shape[1]
+    model = Sequential()
+    model.add(Dense(350, input_dim=input_dim, activation="relu"))
+    for _ in range(4):  # remaining hidden layers, identical to the first
+        model.add(Dense(350, activation="relu"))
+    model.add(Dense(1, activation="linear"))
+    model.compile(optimizer='adam', loss="mean_absolute_error", metrics=["mean_absolute_error"])
+    return model
+
+
+def train_model(model=None):
+    """Train on the global train/val split, then evaluate on the test split.
+
+    Uses create_model() when `model` is None; saves weights, plots loss, returns the model.
+    """
+    if model is None:
+        model = create_model()
+    # fit() returns the History record itself, so no separate History callback is needed
+    history = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=15, batch_size=64)
+
+    # Score the held-out test split and print each reported metric
+    result = model.evaluate(x_test.values, y_test.values)
+    for name, value in zip(model.metrics_names, result):
+        print("Metric ", name, ":", str(round(value, 2)))
+
+    model.save_weights(model_weights_file)
+
+    plt.plot(history.history['loss'])
+    plt.plot(history.history['val_loss'])
+    plt.title("Model's Training & Validation loss across epochs")
+    plt.ylabel('Loss')
+    plt.xlabel('Epochs')
+    plt.legend(['Train', 'Validation'], loc='upper right')
+    plt.show()
+    return model
+
+
 if __name__ == '__main__':
    # main()
    try:
@@ -74,17 +120,108 @@ if __name__ == '__main__':
    print(data.index)

    data = data.drop('Volume', axis=1, level=0)  # drop Volume data
+    data = data.drop('Open', axis=1, level=0)  # drop Open data
+    data = data.drop('Low', axis=1, level=0)  # drop Low data
+    data = data.drop('High', axis=1, level=0)  # drop High data
+    data = data.drop('Close', axis=1, level=0)  # drop Close data
+
+    # data.columns = [' '.join(col).strip() for col in data.columns.values]
+    # only keep level 1 (of 0 and 1) columns:
+    data.columns = data.columns.get_level_values(1)
+
    print(data)
    print(data.keys())
    print(data.iloc[0])

-    print(get_day_x_delta_data(data, 1, True))
+    # print(get_day_x_delta_data(data, 1, True))
+    SYM = 'ZTS'
+    data.dropna(subset=[SYM], inplace=True)
    d1 = get_day_x_delta_data(data, 1)
-    print(d1.corr())
+    d7 = get_day_x_delta_data(data, 7)
+    # print(d1.corr())
+
+    print(d1.head())
+
+    # d1.dropna(subset=[f'{SYM}_1d%'], inplace=True)
+    # d7.dropna(subset=[f'{SYM}_7d%'], inplace=True)
+    d1[f'{SYM}_7d%'] = d7[f'{SYM}_7d%']  # add 7 day data to one day relative performance data
+
+    d1.dropna(subset=[f'{SYM}_1d%', f'{SYM}_7d%'], inplace=True)
+    d7.dropna(subset=[f'{SYM}_7d%'], inplace=True)
+
+    d1.fillna(0, inplace=True)
+    d7.fillna(0, inplace=True)
+    print(d1.head(100))
+
+    print(d1.shape)
+
+    import matplotlib.pyplot as plt
+    import seaborn as sns  # Seaborn is another powerful visualization library for Python
+
+    # sns.lineplot(data=data, x='Date', y=f'{SYM}')
+    # sns.lineplot(data=d1, x='Date', y=f'{SYM}_1d%')
+    # plt.show()
+
+    performance = 1
+    for v in d1[f'{SYM}_1d%']:
+        performance *= (1 + (v / 100))
+    print(performance)
+
+    print("Distinct Datatypes:", data.dtypes.unique())
+
+    target = [f'{SYM}_7d%']
+    from sklearn.model_selection import train_test_split
+
+    # Create train and test dataset with an 80:20 split
+    x_train, x_test, y_train, y_test = train_test_split(d1, d7[target], test_size=0.2, random_state=2018)
+    # Further divide training dataset into train and validation dataset with an 90:10 split
+    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=2018)
+
+    print("Shape of x_train:", x_train.shape)
+    print("Shape of x_val:", x_val.shape)
+    print("Shape of x_test:", x_test.shape)
+    print("Shape of y_train:", y_train.shape)
+    print("Shape of y_val:", y_val.shape)
+    print("Shape of y_test:", y_test.shape)
+
+    print(y_test)
+
+    # calculate the average score of the train dataset
+    mean_sales = y_train.mean()
+    print("Average Sales :", mean_sales)
+    # Calculate the Mean Absolute Error on the test dataset
+    print("MAE for Test Data:", abs(y_test - mean_sales).mean())
+
+    import os
+
+    os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
+    os.environ["PLAIDML_EXPERIMENTAL"] = "1"
+    os.environ["PLAIDML_DEVICE_IDS"] = "opencl_amd_gfx1010.0"
+    import plaidml.keras
+
+    plaidml.keras.install_backend()
+    import keras
+    import tensorflow.keras
+    # Create Deep Neural Network Architecture
+    from tensorflow.keras.models import Sequential
+    from tensorflow.keras.layers import Dense, Dropout
+
+    new_model = False
+    if new_model or (not os.path.exists(model_weights_file)):
+        print("creating new model")
+        model = train_model()
+    else:
+        model = create_model()
+        print("loading existing weights model")
+        model.load_weights(model_weights_file)
+
+    model.summary()
+
+

    exit()
-    print(data.isnull().sum() / data.shape[0] * 100)

+    print(data.isnull().sum() / data.shape[0] * 100)

    first_loc = data.index.get_loc(data.index[0])