Writing the Crawler
Install MySQL and Create the Database
sudo apt install mysql-server
sudo mysql
After opening mysql via sudo, create a database named ClusteringFutures, then create a regular user and grant it privileges on that database. Exit the client when done (with exit or Ctrl+D).
CREATE DATABASE ClusteringFutures;
USE ClusteringFutures;
CREATE USER 'IVIVI_PLUS'@'localhost' IDENTIFIED BY '123456';
GRANT ALL PRIVILEGES ON ClusteringFutures.* TO 'IVIVI_PLUS'@'localhost';
FLUSH PRIVILEGES;
To verify the steps above, log in to the database as the regular user:
mysql -u IVIVI_PLUS -p ClusteringFutures
At this point, the MySQL setup is complete.
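Optionally, the new account can also be checked from Python with pymysql, the driver used throughout the script in the next section. A minimal sketch, assuming the user, password and database created above:

import pymysql
# Connect as the regular user and list the (still empty) tables of ClusteringFutures
db = pymysql.connect(host='localhost', user='IVIVI_PLUS', password='123456', database='ClusteringFutures')
cur = db.cursor()
cur.execute("SHOW TABLES")
print(cur.fetchall())
db.close()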
The Python Side
Basic workflow:
1. Fetch daily K-line data for domestic commodity futures from Sina Finance (a quick fetch-and-inspect sketch follows this list).
2. Import the data into MySQL.
3. Clean the data.
4. Compute the single-day percentage change for each future as the input for training the clustering model.
5. Cluster with scikit-learn.
6. Visualize with Matplotlib.
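Before running the full script, it is worth inspecting what the Sina Finance endpoint returns for a single symbol. A minimal sketch using the same URL as the main script below, assuming (as the INSERT statement later relies on) that each record is a list of the form [date, open, high, low, close, volume]:

import requests
# Fetch the daily K-line history for one future (RB0 = rebar) and peek at the first record
api = 'http://stock2.finance.sina.com.cn/futures/api/json.php/IndexService.getInnerFuturesDailyKLine?symbol=RB0'
resp = requests.get(api)
records = resp.json()
print(len(records))   # number of daily records returned
print(records[0])     # expected to look like [date, open, high, low, close, volume]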
import requests
import pymysql
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from sklearn import cluster, covariance, manifold
# Dictionary of commodity futures codes and names
# Full list (commented out); the subset actually clustered is defined below
# symbol_dict = {
#     'TA0': 'PTA',
#     'OI0': '菜油',
#     'RS0': '菜籽',
#     'RM0': '菜粕',
#     'ZC0': '动力煤',
#     'WH0': '强麦',
#     'JR0': '粳稻',
#     'SR0': '白糖',
#     'CF0': '棉花',
#     'RI0': '早籼稻',
#     'MA0': '甲醇',
#     'FG0': '玻璃',
#     'LR0': '晚籼稻',
#     'SF0': '硅铁',
#     'SM0': '锰硅',
#     'CY0': '棉纱',
#     'AP0': '苹果',
#     'CJ0': '红枣',
#     'V0': 'PVC',
#     'P0': '棕榈油',
#     'B0': '豆二',
#     'M0': '豆粕',
#     'I0': '铁矿石',
#     'JD0': '鸡蛋',
#     'L0': '塑料',
#     'PP0': '聚丙烯',
#     'FB0': '纤维板',
#     'BB0': '胶合板',
#     'Y0': '豆油',
#     'C0': '玉米',
#     'A0': '豆一',
#     'J0': '焦炭',
#     'JM0': '焦煤',
#     'CS0': '淀粉',
#     'EG0': '乙二醇',
#     'FU0': '燃料油',
#     'SC0': '上海原油',
#     'AL0': '铝',
#     'RU0': '天然橡胶',
#     'ZN0': '沪锌',
#     'CU0': '铜',
#     'AU0': '黄金',
#     'RB0': '螺纹钢',
#     'WR0': '线材',
#     'PB0': '铅',
#     'AG0': '白银',
#     'BU0': '沥青',
#     'HC0': '热轧卷板',
#     'SN0': '锡',
#     'NI0': '镍',
#     'SP0': '纸浆'}
symbol_dict = {
    'TA0': 'PTA',
    'OI0': '菜油',
    'RM0': '菜粕',
    'WH0': '强麦',
    'SR0': '白糖',
    'CF0': '棉花',
    'MA0': '甲醇',
    'FG0': '玻璃',
    'SM0': '锰硅',
    'V0': 'PVC',
    'P0': '棕榈油',
    'M0': '豆粕',
    'I0': '铁矿石',
    'JD0': '鸡蛋',
    'L0': '塑料',
    'PP0': '聚丙烯',
    'FB0': '纤维板',
    'BB0': '胶合板',
    'Y0': '豆油',
    'C0': '玉米',
    'A0': '豆一',
    'J0': '焦炭',
    'JM0': '焦煤',
    'CS0': '淀粉',
    'AL0': '铝',
    'RU0': '天然橡胶',
    'ZN0': '沪锌',
    'CU0': '铜',
    'AU0': '黄金',
    'RB0': '螺纹钢',
    'PB0': '铅',
    'AG0': '白银',
    'HC0': '热轧卷板'}
# Sort the futures dict by code and store the codes and names in symbol_list and names for later use
symbol_list, names = np.array(sorted(symbol_dict.items())).T
# Convert symbol_list from np.array to a plain list
symbol_list = list(symbol_list)
# Function that creates a data table for one future
def creat_future_table(symbol):
    future_code = symbol
    # Request the historical daily K-line data from the Sina Finance API
    url_str = ('http://stock2.finance.sina.com.cn/futures/api/json.php/IndexService.getInnerFuturesDailyKLine?symbol=' +
               future_code)
    r = requests.get(url_str)
    daily_klines = r.json()
    daily_klines_lists = list(daily_klines)
    # Write the data into the database; the table is named after the futures code
    future_db = pymysql.connect(host='localhost', user='IVIVI_PLUS', password='123456', database='ClusteringFutures')
    cursor = future_db.cursor()
    # Drop the table if it already exists
    cursor.execute("DROP TABLE IF EXISTS " + future_code)
    # Create the table: trade date (primary key), open, high, low, close and volume
    sql = "CREATE TABLE " + future_code + """ (
        price_date DATE NOT NULL PRIMARY KEY,
        open_price DECIMAL(19, 4),
        high_price DECIMAL(19, 4),
        low_price DECIMAL(19, 4),
        close_price DECIMAL(19, 4),
        volume BIGINT)"""
    # Run the CREATE TABLE statement
    cursor.execute(sql)
    # Insert the data (note: str() of each record keeps the surrounding '[' and ']', hence the slicing)
    for i in daily_klines_lists:
        sql = "INSERT INTO " + future_code + """ (price_date, open_price, high_price, low_price, close_price, volume)
            VALUES (""" + str(i)[1:-1] + ")"
        try:
            # Execute the INSERT statement
            cursor.execute(sql)
            # Commit to the database
            future_db.commit()
        except:
            print('rollback table %s' % future_code)
            # Roll back on error
            future_db.rollback()
    # Close the database connection
    future_db.close()
# Function that deletes junk rows from a table
def delete_rows(symbol):
    future_code = symbol
    future_db = pymysql.connect(host='localhost', user='IVIVI_PLUS', password='123456', database='ClusteringFutures')
    cursor = future_db.cursor()
    # Treat any row whose open price, close price or volume is 0 as junk data
    sql = "DELETE FROM " + future_code + " WHERE open_price = 0 OR close_price = 0 OR volume = 0"
    cursor.execute(sql)
    future_db.commit()
    future_db.close()
# Function that returns all trading dates stored in a table
def select_columns(symbol):
    future_code = symbol
    future_db = pymysql.connect(host='localhost', user='IVIVI_PLUS', password='123456', database='ClusteringFutures')
    cursor = future_db.cursor()
    sql = "SELECT price_date FROM " + future_code
    cursor.execute(sql)
    r = cursor.fetchall()
    future_db.close()
    return r
# Function that queries one field of one future on one trading day
def select_a_data(symbol, date, data_name):
    future_code = symbol
    future_db = pymysql.connect(host='localhost', user='IVIVI_PLUS', password='123456', database='ClusteringFutures')
    cursor = future_db.cursor()
    sql = "SELECT %s FROM " % data_name + future_code + " WHERE price_date = '" + date + "'"
    cursor.execute(sql)
    r = cursor.fetchall()
    future_db.close()
    return r
print('Clustering %d futures' % len(symbol_list))
# Create a data table for each future in symbol_list and delete its junk rows
for s in symbol_list:
    print(symbol_dict[s])
    creat_future_table(s)
    delete_rows(s)
# Find the trading dates shared by all futures in symbol_list
date_set_list = []  # list of sets of trading dates (sets make the intersection below easy)
for s in symbol_list:
    r = select_columns(s)
    date_set_list.append(set(r))
com_date_set = date_set_list[0]  # shared trading dates, initialised to the first future's dates
# Intersect every date set in date_set_list to obtain the shared trading dates
for i in range(len(symbol_list)):
    com_date_set = com_date_set.intersection(date_set_list[i])
com_date_list = list(com_date_set)
com_date_list.sort()
# Keep the shared trading dates as strings for the later queries
str_com_date_list = []
for i in com_date_list:
    str_com_date_list.append(str(i[0]))
# Print the number of shared trading days
print('The futures to be clustered share %d valid trading records' % len(str_com_date_list))
# Convert the close and open prices on the shared trading dates to np.array
# After transposing, each column holds one future's close (or open) price on each trading date
# Rows = number of trading days, columns = number of futures
close_prices_list = []
for s in symbol_list:
    tmp_list = []
    for date in str_com_date_list:
        tmp_list.append(select_a_data(s, date, 'close_price')[0][0])
    close_prices_list.append(tmp_list)
close_prices = np.array(close_prices_list).T
open_prices_list = []
for s in symbol_list:
    tmp_list = []
    for date in str_com_date_list:
        tmp_list.append(select_a_data(s, date, 'open_price')[0][0])
    open_prices_list.append(tmp_list)
open_prices = np.array(open_prices_list).T
# The single-day percentage change of each future on each day is the model input
X = (close_prices - open_prices) / open_prices * 100
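# A quick sanity check on the formula with made-up numbers: a day that opens at 3000
# and closes at 3060 gives (3060 - 3000) / 3000 * 100 = 2.0, i.e. a +2% day.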
# Check the shape of the input
print('Checking the shape of the input data')
print(X.shape)
# Cluster using ready-made scikit-learn models
edge_model = covariance.GraphicalLassoCV()
edge_model.fit(X)
_, labels = cluster.affinity_propagation(edge_model.covariance_)
n_labels = labels.max()
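# Note: affinity propagation chooses the number of clusters on its own; the labels
# run from 0 to n_labels, so there are n_labels + 1 clusters, one label per future
# (one per column of X / row of the covariance matrix).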
# #############################################################################
# Find a low-dimension embedding for visualization: find the best position of
# the nodes (the futures) on a 2D plane
# We use a dense eigen_solver to achieve reproducibility (arpack is
# initiated with random vectors that we don't control). In addition, we
# use a large number of neighbors to capture the large-scale structure.
node_position_model = manifold.LocallyLinearEmbedding(
    n_components=2, eigen_solver='dense', n_neighbors=6)
embedding = node_position_model.fit_transform(X.T).T
# #############################################################################
# Visualization
plt.rcParams['font.sans-serif'] = ['TW-Sung']  # use a Chinese-capable font so the futures names display correctly
plt.figure(1, facecolor='w', figsize=(10, 8))
plt.clf()
ax = plt.axes([0., 0., 1., 1.])
plt.axis('off')
# Display a graph of the partial correlations
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
# Plot the nodes using the coordinates of our embedding
plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
            cmap=plt.cm.nipy_spectral)