pytho线性回归的示例程序。

# ########################
# 线性回归                #
#   计算、误差估计与偏差评估 #
##########################
# 参考: https://pythonprogramming.net/regression-introduction-machine-learning-tutorial/?completed=/machine-learning-tutorial-python-introduction/
import pandas as pd 
import quandl

import numpy as np
from sklearn import preprocessing, svm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import math
import matplotlib.pyplot as plt

# 获取goole股票历史数据
df = quandl.get('WIKI/GOOGL') # returns dataframe

# print(df.head())

# 取得其中一部分列
df = df[['Adj. Open', 'Adj. High','Adj. Low','Adj. Close','Adj. Volume', ]]

# High-Low PerCenTage - 计算最高-最低的百分比
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Close'] * 100
# open-close percentage change - 计算闭市-开市的增长百分比
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100

# 定义新的数据集
df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]

# 增加一列标签
forecast_col = 'Adj. Close'
# 标记其中的空值
df.fillna(-99999, inplace=True) # fill holes inplace with value -99999

# 取得其中一部分列用于预测
forecast_out = int(math.ceil(0.01*len(df))) # len(df) = 3424

# 将 forecast_col('Adj. Close') 向上移动 #forecast_out 行
# 并赋值给 'label' 列
df['label'] = df[forecast_col].shift(-forecast_out)

# 去除NA的列
df.dropna(inplace=True) 

# 定义 X 为特征值, 作为dataframe - 除了label列,
# 然后转换成numpy array
X = np.array(df.drop(['label'],1 ))
# 定义 y 作为对应特征值的标签
y = np.array(df['label'])

# 预处理,即规范化数据。scale X = (X-X_mean)/X_std, 对每个属性/每列来说所有数据都聚集在0附近,方差值为1。
# 参见: https://blog.csdn.net/Dream_angel_Z/article/details/49406573
X = preprocessing.scale(X, axis=0, with_mean=True,with_std=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=-0.2)

# n_jobs : 设置线程数, -1 自动设定
clf = LinearRegression(n_jobs=-1)

# 拟合数据
# 参考: https://ithelp.ithome.com.tw/articles/10206114
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)

print(accuracy)

#for k in ['linear','poly','rbf','sigmoid']:
#    clf = svm.SVR(kernel=k, gamma='scale')
#    clf.fit(X_train, y_train)
#    confidence = clf.score(X_test, y_test)
#    print(k,confidence)

# X 为列,转置为行,便于和y_pred绘制
xfit = np.transpose(X_test)[0]
y_pred = clf.predict(X_test)

# 线性方程的系数
print('Coefficients: \n', clf.coef_)
# 平方差
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# Explained variance score 可解释方差: 1 为最佳预测
print('Variance score: %.2f' % r2_score(y_test, y_pred))

# 画输入点为灰色
plt.scatter(xfit, y_test, color='gray')
# 预测点为绿色
plt.scatter(xfit, y_pred, color='green')

# 显示绘制的图形
plt.show()