Browse Source

Initial commit

master
lijunhui 7 months ago
commit
6133dcb014
  1. 8
      .idea/.gitignore
  2. 4
      .idea/vcs.xml
  3. 4
      README.txt
  4. 321
      eastmoney.py
  5. 60
      eastmoney_getcompany.py
  6. 85
      import_finance.py
  7. 5692
      stock_code.csv

8
.idea/.gitignore

@ -0,0 +1,8 @@
# 默认忽略的文件
/shelf/
/workspace.xml
# 基于编辑器的 HTTP 客户端请求
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

4
.idea/vcs.xml

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings" defaultProject="true" />
</project>

4
README.txt

@ -0,0 +1,4 @@
1.先到东方财富终端导出股票代码以及股票名称,保存到stock_code.csv中(首行为表头,脚本读取时会跳过首行);
2.执行eastmoney_getcompany.py,获得公司名称(脚本实际输出到code_2.csv,需整理并加上表头后另存为company.csv,供下一步读取);
3.执行eastmoney.py,获得公司详细财务信息company_finance_details.csv,其中由于某些公司css布局会有特殊情况采集不到,会收集到company_error.csv中,需要修改css代码重新爬取或者手动采集;
4.整理好数据后,执行import_finance.py将数据导入数据库中。

321
eastmoney.py

@ -0,0 +1,321 @@
# coding=utf-8
import csv
import time
from selenium import webdriver
MAX_RETRY_TIME = 150
URL = "https://emweb.securities.eastmoney.com/pc_hsf10/pages/index.html?type=web&code=%s&color=b#/cwfx"
# Seconds to pause between polling attempts while the page is still rendering.
_POLL_INTERVAL = 0.2
# Attempts at reading the statement tables before a company is given up on
# and recorded in company_error.csv.
_MAX_DATA_ATTEMPTS = 27
# Locator strategy string accepted by WebDriver.find_elements (this is the
# value of selenium's By.CSS_SELECTOR); works on Selenium 3 and 4 alike.
_BY_CSS = "css selector"
# Report years, and the table column each year is read from.
_REPORT_YEARS = (2022, 2021, 2020)
_YEAR_COLUMNS = (2, 3, 4)
# Tabs clicked before scraping; the failure message printed below calls them
# the annual-report buttons.
_ANNUAL_TAB_SELECTORS = (
    'div.section.zcfzb ul:last-child li:nth-child(2)',
    'div.section.lrb ul:last-child li:nth-child(2)',
)
# (output label, source table) for each scraped metric, in output order:
# total assets, sales revenue, total profit, net profit, total liabilities,
# tax amount. "zcfzb" / "lrb" are the CSS name fragments of the two tables
# (presumably the balance sheet and the income statement -- verify).
_METRICS = (
    ("资产总额", "zcfzb"),
    ("销售收入", "lrb"),
    ("利润总额", "lrb"),
    ("净利润", "lrb"),
    ("负债总计", "zcfzb"),
    ("纳税额", "zcfzb"),
)
# Table row of every metric (same order as _METRICS) for each page layout.
# A layout is chosen by the first keyword group found in the company name.
_ROW_LAYOUTS = (
    (("银行", "信托"), (43, 2, 35, 39, 74, 59)),
    (("金融", "证券", "期货"), (44, 2, 40, 42, 76, 61)),
    (("保险",), (51, 2, 53, 57, 98, 72)),
)
# Rows used when no keyword matches.
_DEFAULT_ROWS = (69, 2, 48, 52, 126, 88)


def _with_exchange_prefix(code):
    """Return ``code`` prefixed with SH, BJ or SZ according to its first digit."""
    if code.startswith('6'):
        return 'SH' + code
    if code.startswith(('8', '4')):
        return 'BJ' + code
    return 'SZ' + code


def _rows_for_company(company_name):
    """Return the metric row numbers of the layout matching ``company_name``."""
    for keywords, rows in _ROW_LAYOUTS:
        if any(keyword in company_name for keyword in keywords):
            return rows
    return _DEFAULT_ROWS


def _select_annual_tabs(browser):
    """Switch to the newly opened tab and click both annual-report buttons.

    Polls up to MAX_RETRY_TIME times because the page renders asynchronously.
    Returns True once both buttons were clicked, False if they never appeared.
    """
    for _ in range(MAX_RETRY_TIME):
        try:
            browser.switch_to.window(browser.window_handles[1])
            # Locate both buttons before clicking either, so a half-rendered
            # page is retried without any click having been issued.
            buttons = [browser.find_elements(_BY_CSS, selector)[0]
                       for selector in _ANNUAL_TAB_SELECTORS]
            for button in buttons:
                browser.execute_script("arguments[0].click();", button)
                time.sleep(_POLL_INTERVAL)
            return True
        except Exception:
            # Window or element not available yet; wait and poll again.
            # (Exception, not a bare except, so Ctrl-C still interrupts.)
            time.sleep(_POLL_INTERVAL)
    return False


def _read_metrics(browser, rows):
    """Read every metric for every report year; raises if a cell is missing."""
    values = []
    for (_label, table), row in zip(_METRICS, rows):
        for column in _YEAR_COLUMNS:
            selector = 'div.%s_table tr:nth-child(%d) td:nth-child(%d) span' % (
                table, row, column)
            values.append(browser.find_elements(_BY_CSS, selector)[0].text)
    return values


def _scrape_metrics(browser, company_name):
    """Return the 18 metric strings for the current page, or None on failure."""
    rows = _rows_for_company(company_name)
    for _ in range(_MAX_DATA_ATTEMPTS):
        try:
            return _read_metrics(browser, rows)
        except Exception:
            # Table not rendered yet, or the layout does not match.
            time.sleep(_POLL_INTERVAL)
    return None


def _append_csv_row(path, row):
    """Append a single row to the CSV file at ``path``."""
    with open(path, 'a', newline='') as csv_file:
        csv.writer(csv_file).writerow(row)


def get_company_finance(stock_code_list):
    """Scrape three years of financial figures for every listed company.

    For each entry a new browser tab is opened on the company's finance page,
    the annual-report tabs are selected, and six metrics are read for 2022,
    2021 and 2020. Successful results are printed and appended to
    ``company_finance_details.csv``; companies whose tables cannot be read are
    appended to ``company_error.csv`` instead.

    Args:
        stock_code_list: iterable of rows ``[code, stock name, company name]``.
            Rows with fewer than three fields (e.g. blank CSV lines) are
            skipped.

    Returns:
        None. If the annual-report buttons of a page never appear, a message
        is printed and the whole run stops early.
    """
    browser = webdriver.Firefox()
    try:
        for entry in stock_code_list:
            if len(entry) < 3:
                continue
            code, name, company_name = entry[0], entry[1], entry[2]

            url = URL % _with_exchange_prefix(code)
            browser.execute_script('window.open("%s")' % url)

            if not _select_annual_tabs(browser):
                print("年报按钮获取失败")
                return

            identity = [code, name, company_name]
            values = _scrape_metrics(browser, company_name)
            if values is None:
                _append_csv_row('company_error.csv', identity)
            else:
                keys = ["股票代码", "股票名称", "公司名称"] + [
                    label + str(year)
                    for label, _table in _METRICS
                    for year in _REPORT_YEARS
                ]
                print(dict(zip(keys, identity + values)))
                _append_csv_row('company_finance_details.csv', identity + values)

            # Close the company tab and return to the original window.
            browser.close()
            browser.switch_to.window(browser.window_handles[0])
    finally:
        # Always end the browser session, including on early return or error.
        browser.quit()
# Load the company list and scrape the financial details of every entry.
company_csv_path = 'company.csv'
with open(company_csv_path, 'r', newline='') as company_file:
    company_rows = csv.reader(company_file, delimiter=',')
    # The first row is treated as a header and skipped; next(..., None)
    # tolerates an empty file instead of raising.
    next(company_rows, None)
    stock_code_list = list(company_rows)
get_company_finance(stock_code_list)

60
eastmoney_getcompany.py

@ -0,0 +1,60 @@
import csv
import time
from selenium import webdriver
MAX_RETRY_TIME = 150
URL = "https://emweb.securities.eastmoney.com/pc_hsf10/pages/index.html?type=web&code=%s&color=b#/gsgk"
def _prefixed_stock_code(code):
    """Return ``code`` prefixed with SH, BJ or SZ according to its first digit."""
    if code.startswith('6'):
        return 'SH' + code
    if code.startswith(('8', '4')):
        return 'BJ' + code
    return 'SZ' + code


def _read_company_name(browser):
    """Switch to the newly opened tab and read the company name cell.

    Polls up to MAX_RETRY_TIME times because the page renders asynchronously.
    Returns the cell text, or None if the cell never became available.
    """
    for _ in range(MAX_RETRY_TIME):
        try:
            browser.switch_to.window(browser.window_handles[1])
            # "css selector" is the value of selenium's By.CSS_SELECTOR;
            # find_elements(by, value) works on Selenium 3 and 4 alike.
            cells = browser.find_elements(
                "css selector", 'div.jbzl_table tr:nth-child(1) td')
            return cells[0].text
        except Exception:
            # Window or element not available yet; wait and poll again.
            # (Exception, not a bare except, so Ctrl-C still interrupts.)
            time.sleep(0.2)
    return None


def get_company(stock_code_list):
    """Look up the company name of every stock and append it to code_2.csv.

    For each entry a new browser tab is opened on the company profile page
    and the first cell of the ``jbzl_table`` table is read as the company
    name. Each result is printed and appended to ``code_2.csv`` as
    ``[code, stock name, company name]``.

    Args:
        stock_code_list: iterable of rows whose first two fields are the
            stock code and the stock name. Rows with fewer than two fields
            (e.g. blank CSV lines) are skipped.

    Returns:
        None on success; "获取失败" if a company name could not be read
        within MAX_RETRY_TIME attempts (the run stops there); "出错啦" if
        any other exception occurred (the exception is printed).
    """
    browser = None
    try:
        browser = webdriver.Firefox()
        for entry in stock_code_list:
            if len(entry) < 2:
                continue
            code, name = entry[0], entry[1]

            url = URL % _prefixed_stock_code(code)
            browser.execute_script('window.open("%s")' % url)

            company_name = _read_company_name(browser)
            if company_name is None:
                return "获取失败"

            print({"股票代码": code, "股票名称": name, "公司名称": company_name})
            with open('code_2.csv', 'a', newline='') as csv_file:
                csv.writer(csv_file).writerow([code, name, company_name])

            # Close the company tab and return to the original window.
            browser.close()
            browser.switch_to.window(browser.window_handles[0])
    except Exception as e:
        print(e)
        return "出错啦"
    finally:
        # Always end the browser session, including on the failure returns.
        if browser is not None:
            try:
                browser.quit()
            except Exception as e:
                # Best-effort cleanup: keep the documented return value.
                print(e)
# Load the exported stock list and look up the company name of every entry.
stock_csv_path = 'stock_code.csv'
with open(stock_csv_path, 'r', newline='') as stock_file:
    stock_rows = csv.reader(stock_file, delimiter=',')
    # The first row is treated as a header and skipped; next(..., None)
    # tolerates an empty file instead of raising.
    next(stock_rows, None)
    stock_code_list = list(stock_rows)
get_company(stock_code_list)

85
import_finance.py

@ -0,0 +1,85 @@
# coding=utf-8
import csv
import pymysql
import os

# Connection settings for the target MySQL database.
#
# Every value can be overridden with an environment variable so the same
# script can target a local, test or production database without editing the
# source. The defaults for host, port, user and database name are the
# production values.
#
# The password has no default and must be supplied via FINANCE_DB_PASSWORD:
# credentials do not belong in version control.
# NOTE(review): passwords that were previously committed in this file remain
# in the repository history and should be rotated.
db_host = os.environ.get('FINANCE_DB_HOST', '119.23.173.194')
db_port = int(os.environ.get('FINANCE_DB_PORT', '17600'))
db_user = os.environ.get('FINANCE_DB_USER', 'ccwfinances')
db_password = os.environ.get('FINANCE_DB_PASSWORD', '')
db_name = os.environ.get('FINANCE_DB_NAME', 'chace')
def turn_num(s):
    """Convert a scraped amount such as "1.2万亿" or "3.5亿" to a float.

    Amounts are normalised to units of 万 (10**4): a "万亿" suffix multiplies
    by 100000000, "亿" by 10000, and "万" leaves the number unchanged.
    A string with no unit suffix is converted as-is (presumably it is already
    expressed in 万 -- verify against the scraped data).

    Args:
        s: the cell text, e.g. "3.5亿", "-120万" or "42.7".

    Returns:
        The amount as a float. Blank cells and the "--" placeholder are
        treated as missing data and yield 0.0.

    Raises:
        ValueError: if the numeric part cannot be parsed.
    """
    text = s.strip()
    if text in ('', '--'):
        return 0.0
    # "万亿" must be tested first because it contains both shorter units.
    for unit, multiplier in (('万亿', 100000000), ('亿', 10000), ('万', 1)):
        if unit in text:
            return float(text.replace(unit, '')) * multiplier
    return float(text)
# Import the scraped figures from company_finance_details.csv into the
# ccw_company_finance_details table, one INSERT (and commit) per company.
finance_csv_path = 'company_finance_details.csv'
with open(finance_csv_path, 'r', newline='') as finance_file:
    finance_reader = csv.reader(finance_file, delimiter=',')
    # The first row is treated as a header and skipped; next(..., None)
    # tolerates an empty file. Blank lines are dropped as well.
    next(finance_reader, None)
    company_finance_details_list = [row for row in finance_reader if row]

# Placeholders are filled in by the driver, so quotes or other special
# characters in the scraped names cannot break or inject into the statement.
insert_sql = (
    "INSERT INTO ccw_company_finance_details("
    "stock_code,stock_name,company_name,"
    "TotalAssets_2022,TotalAssets_2021,TotalAssets_2020,"
    "SalesProceeds_2022,SalesProceeds_2021,SalesProceeds_2020,"
    "LRZE_2022,LRZE_2021,LRZE_2020,"
    "JLR_2022,JLR_2021,JLR_2020,"
    "JZC_2022,JZC_2021,JZC_2020,"
    "Ratal_2022,Ratal_2021,Ratal_2020,"
    "is_delisted) VALUES(" + ",".join(["%s"] * 21) + ",0)"
)

if company_finance_details_list:
    db = pymysql.connect(user=db_user, passwd=db_password, db=db_name, host=db_host, port=int(db_port),
                         charset="utf8mb4", use_unicode=True)
    try:
        with db.cursor() as cursor:
            for company_finance_details in company_finance_details_list:
                stock_code = company_finance_details[0]
                stock_name = company_finance_details[1]
                company_name = company_finance_details[2]

                # Columns 3..20 hold six metrics x three years (2022, 2021,
                # 2020): total assets, sales, total profit, net profit,
                # total liabilities, tax amount.
                amounts = [turn_num(cell) for cell in company_finance_details[3:21]]
                if len(amounts) != 18:
                    raise ValueError(
                        "expected 18 amount columns for %s, got %d"
                        % (stock_code, len(amounts)))
                total_assets = amounts[0:3]
                sales_proceeds = amounts[3:6]
                total_profit = amounts[6:9]
                net_profit = amounts[9:12]
                total_liabilities = amounts[12:15]
                tax_paid = amounts[15:18]
                # JZC is stored instead of the liabilities themselves:
                # total assets minus total liabilities, per year.
                net_assets = [assets - liabilities for assets, liabilities
                              in zip(total_assets, total_liabilities)]

                params = ([stock_code, stock_name, company_name]
                          + total_assets + sales_proceeds + total_profit
                          + net_profit + net_assets + tax_paid)
                # mogrify returns the statement exactly as it will be sent.
                print(cursor.mogrify(insert_sql, params))
                cursor.execute(insert_sql, params)
                # Commit per row so rows already imported survive a later
                # failure.
                db.commit()
    finally:
        db.close()

5692
stock_code.csv

File diff suppressed because it is too large
Loading…
Cancel
Save