import os,sys,datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import pprint
import statsmodels.tsa.stattools as ts
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.lda import LDA
from sklearn.metrics import confusion_matrix
from sklearn.qda import QDA
from sklearn.svm import LinearSVC, SVC
def load_stock_data(file_name):
    """Load a pickled pandas object from ``file_name``.

    Args:
        file_name: Path passed unchanged to ``pandas.read_pickle``.

    Returns:
        The object stored in the pickle. The rest of this script uses it as
        a DataFrame that has "Close" and "Volume" columns.
    """
    # NOTE(review): unpickling can execute arbitrary code, so this should only
    # be pointed at files that come from a trusted source.
    return pd.read_pickle(file_name)
#create dataset
def make_dataset(df, time_lags=1):
    """Build a table of lagged close/volume features from ``df``.

    For each of "Close" and "Volume" the result holds the raw value, the
    value shifted by ``time_lags`` rows, the percentage change of that
    shifted column between consecutive rows, and the sign of that change.

    Args:
        df: Frame with "Close" and "Volume" columns.
        time_lags: Number of rows to shift by; also embedded in the
            generated column names (e.g. ``Close_Lag1``).

    Returns:
        A new DataFrame on the same index as ``df`` with every row that
        contains a NaN removed (the leading rows produced by the shift and
        the percentage change).
    """
    suffix = str(time_lags)
    # (source column, lagged column name, percentage-change column name)
    layout = []
    for source in ("Close", "Volume"):
        lag_name = "%s_Lag%s" % (source, suffix)
        layout.append((source, lag_name, lag_name + "_Change"))

    dataset = pd.DataFrame(index=df.index)
    for source, _, _ in layout:
        dataset[source] = df[source]

    for source, lag_name, change_name in layout:
        dataset[lag_name] = df[source].shift(time_lags)
        dataset[change_name] = dataset[lag_name].pct_change() * 100.0

    # Direction columns are appended last so the column order stays
    # Close, Volume, lag/change pairs, then the two directions.
    for source, _, change_name in layout:
        dataset[source + "_Direction"] = np.sign(dataset[change_name])

    return dataset.dropna(how='any')
#split dataset
def split_dataset(df, input_column_array, output_column, spllit_ratio):
    """Split ``df`` chronologically into training and test portions.

    The split date is obtained from ``get_date_by_percent`` using the first
    and last index values of ``df`` and ``spllit_ratio``. Rows whose index is
    earlier than that date go to the training set; rows on or after it go to
    the test set.

    Args:
        df: Frame whose index supports subtraction and ordering comparison
            (the script uses a date index).
        input_column_array: List of column names used as model inputs.
        output_column: Name of the column used as the target.
        spllit_ratio: Fraction handed to ``get_date_by_percent``.

    Returns:
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    first_stamp = df.index[0]
    last_stamp = df.index[-1]
    split_date = get_date_by_percent(first_stamp, last_stamp, spllit_ratio)

    # Column selection keeps the index, so both masks can be built once from
    # df.index and reused for inputs and target alike.
    before_split = df.index < split_date
    from_split_on = df.index >= split_date

    features = df[input_column_array]
    target = df[output_column]
    return (
        features[before_split],
        features[from_split_on],
        target[before_split],
        target[from_split_on],
    )
def get_date_by_percent(start_date, end_date, percent):
    """Return the date ``percent`` of the way from ``start_date`` to ``end_date``.

    Only whole days are considered: the span is taken from the ``.days``
    attribute of ``end_date - start_date`` and the scaled offset is truncated
    toward zero before being added to ``start_date``.

    Args:
        start_date: Beginning of the span.
        end_date: End of the span.
        percent: Fraction of the span (e.g. ``0.75``).

    Returns:
        ``start_date`` advanced by the truncated number of days.
    """
    span_in_days = (end_date - start_date).days
    whole_day_offset = datetime.timedelta(days=np.trunc(span_in_days * percent))
    return start_date + whole_day_offset
#forecasting models
def do_logistic_regression(x_train, y_train):
    """Fit a ``LogisticRegression`` with default settings and return it.

    Args:
        x_train: Training inputs.
        y_train: Training labels.

    Returns:
        The fitted estimator.
    """
    # scikit-learn estimators return themselves from fit().
    return LogisticRegression().fit(x_train, y_train)
def do_random_forest(x_train, y_train):
    """Fit a ``RandomForestClassifier`` with default settings and return it.

    Args:
        x_train: Training inputs.
        y_train: Training labels.

    Returns:
        The fitted estimator.
    """
    # scikit-learn estimators return themselves from fit().
    return RandomForestClassifier().fit(x_train, y_train)
def do_svm(x_train, y_train):
    """Fit an ``SVC`` with default settings and return it.

    Args:
        x_train: Training inputs.
        y_train: Training labels.

    Returns:
        The fitted estimator.
    """
    # scikit-learn estimators return themselves from fit().
    return SVC().fit(x_train, y_train)
#operate and evaluate the models
def test_predictor(classifier, x_test, y_test):
    """Evaluate a fitted classifier on held-out data.

    Args:
        classifier: Object exposing ``predict(x)`` and ``score(x, y)``.
        x_test: Inputs, passed unchanged to the classifier.
        y_test: True labels, one per prediction (list, array or Series).

    Returns:
        ``(hit_ratio, score)`` where ``hit_ratio`` is the fraction of
        predictions equal to the corresponding true label and ``score`` is
        whatever ``classifier.score(x_test, y_test)`` returns.

    Raises:
        ZeroDivisionError: If ``y_test`` is empty.
    """
    predicted = np.asarray(classifier.predict(x_test))
    # Compare strictly by position. Indexing a pandas Series with a bare
    # integer is a label lookup, which fails (or silently pairs the wrong
    # rows) when the index is integer-valued but not 0..n-1, so the labels
    # are converted to a plain array first.
    actual = np.asarray(y_test)
    total_count = len(actual)
    hit_count = int(np.count_nonzero(predicted == actual))
    hit_ratio = hit_count / total_count
    score = classifier.score(x_test, y_test)
    return hit_ratio, score


# The name matches pytest's ``test_*`` pattern; this marker stops pytest from
# collecting the helper as a test case when it is imported into a test module.
test_predictor.__test__ = False
# Output the hit-rate and the confusion matrix for each model
#print("%s\n" % confusion_matrix(pred, y_test))
if __name__ == "__main__":
    # For each lag from 1 to 5 and each company: load the price data, build
    # the lagged dataset, split it 75% / 25% by date, fit the three
    # classifiers on the training part and print their hit ratios on the
    # test part.
    for time_lags in range(1, 6):
        print("- Time Lags=%s" % (time_lags))
        feature_columns = ["Close_Lag%s" % (time_lags), "Volume_Lag%s" % (time_lags)]
        for company in ['samsung', 'hyundai']:
            # NOTE: despite the .csv suffix, load_stock_data reads this file
            # with pandas.read_pickle.
            df_company = load_stock_data('%s_2010to2017.csv' % (company))
            df_dataset = make_dataset(df_company, time_lags)
            X_train, X_test, Y_train, Y_test = split_dataset(
                df_dataset, feature_columns, "Close_Direction", 0.75)

            lr_classifier = do_logistic_regression(X_train, Y_train)
            lr_hit_ratio, lr_score = test_predictor(lr_classifier, X_test, Y_test)

            rf_classifier = do_random_forest(X_train, Y_train)
            rf_hit_ratio, rf_score = test_predictor(rf_classifier, X_test, Y_test)

            svm_classifier = do_svm(X_train, Y_train)
            # Evaluate the SVM itself; scoring rf_classifier here made the SVM
            # column a copy of the random-forest result.
            svm_hit_ratio, svm_score = test_predictor(svm_classifier, X_test, Y_test)

            # The "Regreesion" spelling is kept so the output format stays
            # identical to earlier runs.
            print("%s : Hit Ratio - Logistic Regreesion=%0.2f, RandomForest=%0.2f, SVM=%0.2f" % (company,lr_hit_ratio,rf_hit_ratio,svm_hit_ratio))
- Time Lags=1
samsung : Hit Ratio - Logistic Regreesion=0.54, RandomForest=0.53, SVM=0.53
hyundai : Hit Ratio - Logistic Regreesion=0.47, RandomForest=0.47, SVM=0.47
- Time Lags=2
samsung : Hit Ratio - Logistic Regreesion=0.54, RandomForest=0.49, SVM=0.49
hyundai : Hit Ratio - Logistic Regreesion=0.47, RandomForest=0.47, SVM=0.47
- Time Lags=3
samsung : Hit Ratio - Logistic Regreesion=0.54, RandomForest=0.49, SVM=0.49
hyundai : Hit Ratio - Logistic Regreesion=0.46, RandomForest=0.45, SVM=0.45
- Time Lags=4
samsung : Hit Ratio - Logistic Regreesion=0.54, RandomForest=0.52, SVM=0.52
hyundai : Hit Ratio - Logistic Regreesion=0.47, RandomForest=0.45, SVM=0.45
- Time Lags=5
samsung : Hit Ratio - Logistic Regreesion=0.54, RandomForest=0.48, SVM=0.48
hyundai : Hit Ratio - Logistic Regreesion=0.47, RandomForest=0.40, SVM=0.40
0 Comments on "Calculation of Hit Ratio by Logistic Regression, RandomForest, SVM about samsung & hyundai"
Post a Comment