diff --git a/.gitignore b/.gitignore index a95a4f4..df2e8f9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .vscode -**.pdf \ No newline at end of file +**.pdf +**__pycache__** diff --git a/.vscode/settings.json b/.vscode/settings.json index c288c13..be2d89d 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,7 @@ { - "python.pythonPath": "/home/jason/miniconda3/envs/quants/bin/python" + "python.pythonPath": "/home/jason/miniconda3/envs/quants/bin/python", + "cSpell.words": [ + "GOOG", + "stdev" + ] } \ No newline at end of file diff --git a/courses/sec2_chap2_deciphering_the_markets_with_technical_analysis.md b/courses/sec2_chap2_deciphering_the_markets_with_technical_analysis.md index aa0d779..5f0b3be 100644 --- a/courses/sec2_chap2_deciphering_the_markets_with_technical_analysis.md +++ b/courses/sec2_chap2_deciphering_the_markets_with_technical_analysis.md @@ -278,3 +278,22 @@ MOM的含义: 时间序列分析因为较为advanced,因此这里暂时略过 +## Conclusion + +* 介绍了产生交易信号的基本概念,如 **阻力线、支撑线** +* 介绍了一些简单的技术指标 +* 介绍了一些更加高级的数学方法 + * Autoregressive (AR) + * Moving Average (MA) + * Differencing (D) + * AutoCorrelation Function (ACF) + * Partial Autocorrelation Function (PACF) + +## Further Reading + +需要了解更多的概念: + +* stochastic processes +* random walks +* martingales +* time series analysis \ No newline at end of file diff --git a/courses/sec2_chap3_predicting_the_market_with_basic_machine_learning.md b/courses/sec2_chap3_predicting_the_market_with_basic_machine_learning.md new file mode 100644 index 0000000..a58e67e --- /dev/null +++ b/courses/sec2_chap3_predicting_the_market_with_basic_machine_learning.md @@ -0,0 +1,199 @@ +# Chapter 3. Predicting the Markets with Basic Machine Learning + +## 1. 
了解术语和符号 + +* **Supervised Learning Problem (有监督学习) vs Unsupervised Learning Problem**: + +> 有监督学习是从标签化训练数据集中推断出函数的机器学习任务 +> +> 无监督学习是一种机器学习的训练方式,它本质上是一种统计手段,可以在没有标签的数据里发现潜在的结构。 +> 它主要具备3个特点: +> * 无监督学习没有明确的目的 +> * 无监督学习不需要给数据打标签 +> * 无监督学习无法量化效果 + +ML在算法交易的领域经常用的是有监督学习 + +* **Regression Problem vs Classification Problem**: + +> 监督学习中,如果预测的变量是离散的,我们称其为**分类**(如决策树,支持向量机等),如果预测的变量是连续的,我们称其为**回归**。 + +* **Training Model & Testing Model** + +使用已有数据来训练模型,这个过程被称为 training model; 估计模型 parameter 的过程被称为 **statistical inference of these parametric learning models** + +在训练模型结束后,使用训练好的模型来预测,这个过程被称为 **testing model**, 使用的数据为 **test data** + +* **Performance Metrics (性能指标)** + +得到测试结果后,需要建立 performance metrics 来评估模型 + +* 对回归问题,需要减少 predicted value 和 actual value 之间的**差 residual errors** (i.e. minimize residual errors), 差的计算方法可以是: + * sum of absolute residual errors + * 或 sum of squared residual errors (常用的评估指标为 MSE 和 $R^2$) + +### 1.1 Exploring our financial dataset (处理金融数据) + +ML技术需要预处理数据集,可分为3步: + +1. 获取数据 +2. 定义我们想要预测的数据(应变量) +3. 将数据集分为训练数据集和测试数据集 + +#### Step 1: Getting the data + +此处我们仍然使用 GOOG 股价 + +``` + High Low Open Close Volume Adj Close +Date +2004-08-19 51.835709 47.800831 49.813286 49.982655 44871300.0 49.982655 +2004-08-20 54.336334 50.062355 50.316402 53.952770 22942800.0 53.952770 +2004-08-23 56.528118 54.321388 55.168217 54.495735 18342800.0 54.495735 +2004-08-24 55.591629 51.591621 55.412300 52.239193 15319700.0 52.239193 +2004-08-25 53.798351 51.746044 52.284027 52.802086 9232100.0 52.802086 +... ... ... ... ... ... ... 
+2017-12-22 1064.199951 1059.439941 1061.109985 1060.119995 755100.0 1060.119995 +2017-12-26 1060.119995 1050.199951 1058.069946 1056.739990 760600.0 1056.739990 +2017-12-27 1058.369995 1048.050049 1057.390015 1049.369995 1271900.0 1049.369995 +2017-12-28 1054.750000 1044.770020 1051.599976 1048.140015 837100.0 1048.140015 +2017-12-29 1049.699951 1044.900024 1046.719971 1046.400024 887500.0 1046.400024 + +[3366 rows x 6 columns] +``` + +#### Step 2: Creating objectives (trading conditions that we want to predict) + +获取数据以后,需要创造出需要预测的应变量(response variable); 对GOOG股价,我们有两种预测目标: + +1. 若为了预测未来价格,就要预测价格方向(上升、下降、不变),和程度(+10, +3.4, -4); 可用回归 +2. 若只为了预测价格上升还是下降;可用分类 + +为了处理这两种情况,需要两种trading condition generation 方法: + +* `create_classification_trading_condition(df)`: 如果明天的收盘价高于今天, classification response variable 为$+1$;反之为$-1$; +* `create_regression_trading_condition(df)`: regression response variable 为明天收盘价减去今天收盘价的差值; 如果明天的收盘价高于今天则为正值, 反之则为负值; + +Classification trading condition created: + +``` +( Open-Close High-Low +Date +2004-08-19 -0.169369 4.034878 +2004-08-20 -3.636368 4.273979 +2004-08-23 0.672482 2.206730 +2004-08-24 3.173107 4.000008 +2004-08-25 -0.518059 2.052307 +... ... ... +2017-12-22 0.989990 4.760010 +2017-12-26 1.329956 9.920044 +2017-12-27 8.020020 10.319946 +2017-12-28 3.459961 9.979980 +2017-12-29 0.319946 4.799927 + +[3366 rows x 2 columns], array([ 1, 1, -1, ..., -1, -1, -1])) +``` + +Regression trading condition created: + +``` +( Open-Close High-Low +Date +2004-08-19 -0.169369 4.034878 +2004-08-20 -3.636368 4.273979 +2004-08-23 0.672482 2.206730 +2004-08-24 3.173107 4.000008 +2004-08-25 -0.518059 2.052307 +... ... ... +2017-12-22 0.989990 4.760010 +2017-12-26 1.329956 9.920044 +2017-12-27 8.020020 10.319946 +2017-12-28 3.459961 9.979980 +2017-12-29 0.319946 4.799927 + +[3366 rows x 2 columns], Date +2004-08-19 3.970116 +2004-08-20 0.542965 +2004-08-23 -2.256542 +2004-08-24 0.562893 +2004-08-25 0.951431 + ... 
+2017-12-22 -3.380005 +2017-12-26 -7.369995 +2017-12-27 -1.229980 +2017-12-28 -1.739990 +2017-12-29 NaN +``` + +#### Step 3: 将数据集分成训练集和测试集 + +一般来说,我们会把已有数据分为几份,然后在一份数据集上进行建模,另一份进行测试 + +## 2. Creating predictive models using linear regression methods 用线性回归来建立预测模型 + +Regression: Linear & Non-linear + +* 线性回归: + * Ordinary Least Squares (OLS) + * Lasso + * Ridge + * Elastic Net +* 非线性回归: + * Decision Tree + +### 2.1 Ordinary Least Squares (普通最小二乘法) + +OLS: + +* 假设有: + * $y$ 为 $m\times 1$ 的 target variable + * feature values 有 $m$ 行, 每行为 $1\times n$ +* OLS 希望能够发现一个 $n\times 1$ 的权重向量 $W$, 使预测值 $X\bullet W$ 与 $y$ 之间的残差平方和最小 + + +OLS 要解决的数学问题为 $\min_W \lVert X\bullet W - y\rVert^2_2$, 即找出使 $X\bullet W\approx y$ 的最优近似解; + +* $X$ 是 $m\times n$ 的 matrix of feature values +* $W$ 是 $n\times 1$ matrix/vector + + +例如当 $m=4$, $n=2$ 时: + +$$ +\min +\begin{Vmatrix} +\begin{bmatrix} +x_{00} & x_{01}\\ +x_{10} & x_{11}\\ +x_{20} & x_{21}\\ +x_{30} & x_{31} +\end{bmatrix} +\bullet +\begin{bmatrix} +w_0\\ +w_1 +\end{bmatrix} +- +\begin{bmatrix} +y_0\\ +y_1\\ +y_2\\ +y_3 +\end{bmatrix} +\end{Vmatrix}^2_2 +$$ + + + +### 2.2 Regularization and shrinkage + +### 2.3 Decision tree regression + +## 3. 
Creating predictive models using linear classification methods + +### 3.1 K-nearest Neighbors + +### 3.2 Support Vector Machine + +### 3.3 Logistic Regression + diff --git a/courses/sources/sec2/__init__.py b/courses/sources/sec2/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/courses/sources/sec2/ols.py b/courses/sources/sec2/ols.py new file mode 100644 index 0000000..4eb41b9 --- /dev/null +++ b/courses/sources/sec2/ols.py @@ -0,0 +1,55 @@ +from itertools import tee +import pandas as pd +import matplotlib.pyplot as plt +from pandas import isna +from prepare_financial_data import * + +dir_path = os.path.dirname(os.path.realpath(__file__)) + +goog_data = load_financial_data( + start_date = '2001-01-01', + end_date = '2018-01-01', + output_file = 'goog_data_large.pkl' +) + +X, Y = create_regression_trading_condition(goog_data) +X = X[:-1] +Y = Y[:-1] + +goog_data = goog_data.assign(Target=pd.Series(Y, index=goog_data.index)) +print(goog_data) +goog_data = goog_data[:-1] +print(goog_data) + +pd.plotting.scatter_matrix(goog_data[['Open-Close', 'High-Low', 'Target']], diagonal='kde') +plt.savefig(dir_path + "/scatter_matrix.png") + +""" Split 80% of available data into training feature value and target variable; and remaining 20% of dataset into out-sample testing feature value """ + + +X_train, X_test, Y_train, Y_test = create_train_split_group(X,Y,split_ratio=0.8) + + +from sklearn import linear_model +ols = linear_model.LinearRegression() +ols.fit(X_train, Y_train) + +print('Coefficients: ', ols.coef_) + +""" Performance Matrices """ + +from sklearn.metrics import mean_squared_error, r2_score + +# The mean squared error +print("Mean squared error: {}".format(mean_squared_error(Y_train, ols.predict(X_train)))) + +print(Y_test) +print(X_test) + +# Explained variance score: 1 is perfect prediction +# print("Variance score: " + (r2_score(Y_test, ols.predict(X_test)))) + +goog_data['Predicted_Signal'] = ols.predict(X) +goog_data['GOOG_Returns'] = 
np.log(goog_data['Close']/goog_data['Close'].shift(1)) + +def calculate_return(df, split) \ No newline at end of file diff --git a/courses/sources/sec2/prepare_financial_data.py b/courses/sources/sec2/prepare_financial_data.py new file mode 100644 index 0000000..a6feaaf --- /dev/null +++ b/courses/sources/sec2/prepare_financial_data.py @@ -0,0 +1,39 @@ +import pandas as pd +import os as os +from pandas_datareader import data +from sklearn.model_selection import train_test_split +import numpy as np + +dir_path = os.path.dirname(os.path.realpath(__file__)) + +def load_financial_data(start_date, end_date, output_file): + try: + df = pd.read_pickle(output_file) + print("File data found...reading GOOG data") + except FileNotFoundError: + print("File not found...downloading the GOOG data") + df = data.DataReader('GOOG', 'yahoo', start_date, end_date) + df.to_pickle(output_file) + + return df + +def create_classification_trading_condition(df): + df['Open-Close'] = df.Open - df.Close + df['High-Low'] = df.High - df.Low + df = df.dropna() + X = df[['Open-Close', 'High-Low']] + Y = np.where(df['Close'].shift(-1) > df['Close'], 1, -1) + return (X,Y) + +def create_regression_trading_condition(df): + df['Open-Close'] = df.Open - df.Close + df['High-Low'] = df.High - df.Low + df = df.dropna() + print(df) + X = df[['Open-Close', 'High-Low']] + Y = df['Close'].shift(-1) - df['Close'] + return(X,Y) + +def create_train_split_group(X, Y, split_ratio=0.8): + # Split dataset into two groups + return train_test_split(X, Y, shuffle=False, train_size=split_ratio) \ No newline at end of file diff --git a/courses/sources/sec2/scatter_matrix.png b/courses/sources/sec2/scatter_matrix.png new file mode 100644 index 0000000..a8ab25f Binary files /dev/null and b/courses/sources/sec2/scatter_matrix.png differ