从东方财富爬取上市公司财务信息的代码;法律规定每年4月30日前所有上市公司必须公布年度财报,所以在5月之后去爬就行
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

276 lines
19 KiB

7 months ago
# coding=utf-8
import csv
import time

from scrapy.selector import Selector
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
7 months ago
# Upper bound on attempts inside the scraper's retry loops before it gives up.
MAX_RETRY_TIME = 150
# Eastmoney F10 page URL template. The single %s is filled with the
# exchange-prefixed stock code (e.g. "SH" + code) built by the scraper.
# NOTE(review): the "#/cwfx" fragment presumably selects the financial-analysis
# view of the page -- confirm against the live site.
URL = "https://emweb.securities.eastmoney.com/pc_hsf10/pages/index.html?type=web&code=%s&color=b#/cwfx"
# Reporting-year label -> CSS :nth-child column index in the statement tables.
# NOTE(review): the year labels are hard-coded; columns 2-4 hold whatever
# periods the page currently shows -- confirm before each yearly run.
_YEAR_COLUMNS = (("2024", 2), ("2023", 3), ("2022", 4))

# Metrics in output order: (label used as the printed-dict key prefix,
# statement table the row lives in -- "zcfzb" balance sheet, "lrb" income
# statement).
_METRICS = (
    ("资产总额", "zcfzb"),  # total assets
    ("销售收入", "lrb"),    # operating revenue
    ("利润总额", "lrb"),    # total profit
    ("净利润", "lrb"),      # net profit
    ("负债总计", "zcfzb"),  # total liabilities
    ("纳税额", "zcfzb"),    # taxes payable
)

# Table row numbers (CSS :nth-child) for each metric, in _METRICS order.
# Financial companies use different statement templates, so the layout is
# picked by keyword in the company name; the first matching entry wins.
_ROW_LAYOUTS = (
    (("银行", "信托"), (43, 2, 35, 39, 74, 59)),   # banks / trusts
    (("证券", "期货"), (44, 2, 40, 42, 76, 61)),   # securities / futures
    (("保险",), (51, 2, 53, 57, 98, 72)),          # insurers
)
_DEFAULT_ROWS = (69, 2, 48, 52, 126, 88)           # every other company


def _exchange_prefixed_code(code):
    """Return *code* prefixed with the exchange tag used in the page URL."""
    if code.startswith('6'):
        return 'SH' + code
    if code.startswith(('8', '4')):
        return 'BJ' + code
    return 'SZ' + code


def _append_csv_row(path, row):
    """Append one row to the CSV file at *path*."""
    with open(path, 'a', newline='') as csv_file:
        csv.writer(csv_file).writerow(row)


def _capture_statement_pages(browser):
    """Click through to both annual statements and return their page sources.

    Switches to the second browser window, selects the balance-sheet tab and
    its annual-report view, snapshots the page, then does the same for the
    income statement. The page renders asynchronously, so the whole sequence
    is retried up to MAX_RETRY_TIME times.

    Returns:
        (balance_sheet_html, income_statement_html), or None if every attempt
        failed.
    """
    for _ in range(MAX_RETRY_TIME):
        try:
            browser.switch_to.window(browser.window_handles[1])
            # find_elements(By.CSS_SELECTOR, ...) replaces the
            # find_elements_by_css_selector helper removed in Selenium 4.3.
            # Balance-sheet tab.
            zcfzb_button = browser.find_elements(
                By.CSS_SELECTOR, 'div.cwbbTab ul.commonTab li:nth-child(1)')[0]
            # Annual-report button of the balance sheet.
            zcfzb_nb_button = browser.find_elements(
                By.CSS_SELECTOR,
                'div.cwbbTab+div>div.tab ul:last-child li:nth-child(2)')[0]
            # Income-statement tab.
            lrb_button = browser.find_elements(
                By.CSS_SELECTOR, 'div.cwbbTab ul.commonTab li:nth-child(2)')[0]
            # Annual-report button of the income statement.
            lrb_nb_button = browser.find_elements(
                By.CSS_SELECTOR,
                'div.cwbbTab ~ div ~ div > div.tab ul:last-child li:nth-child(2)')[0]

            # Click via JavaScript, then give the table time to re-render
            # before snapshotting the DOM.
            browser.execute_script("arguments[0].click();", zcfzb_button)
            time.sleep(0.2)
            browser.execute_script("arguments[0].click();", zcfzb_nb_button)
            time.sleep(2)
            page_source_zcfz = browser.page_source

            browser.execute_script("arguments[0].click();", lrb_button)
            time.sleep(0.2)
            browser.execute_script("arguments[0].click();", lrb_nb_button)
            time.sleep(2)
            page_source_lrb = browser.page_source
            return page_source_zcfz, page_source_lrb
        except (IndexError, WebDriverException):
            # IndexError: the window or a button does not exist yet.
            # WebDriverException: stale element, script error, etc.
            time.sleep(0.2)
    return None


def _extract_finance_details(code, name, company_name,
                             page_source_zcfz, page_source_lrb):
    """Build the ordered result dict for one company from the two snapshots.

    Keys are "股票代码", "股票名称", "公司名称" followed by
    "<metric label><year>" for every metric in _METRICS and every year in
    _YEAR_COLUMNS. A cell that is absent from the page yields None.
    """
    rows = _DEFAULT_ROWS
    for keywords, layout_rows in _ROW_LAYOUTS:
        if any(keyword in company_name for keyword in keywords):
            rows = layout_rows
            break

    # Parse each snapshot once instead of once per extracted cell.
    selectors = {
        "zcfzb": Selector(text=page_source_zcfz),
        "lrb": Selector(text=page_source_lrb),
    }
    details = {"股票代码": code, "股票名称": name, "公司名称": company_name}
    for (label, statement), row in zip(_METRICS, rows):
        for year, column in _YEAR_COLUMNS:
            css = "div.%s_table tr:nth-child(%d) td:nth-child(%d) span::text" % (
                statement, row, column)
            details[label + year] = selectors[statement].css(css).extract_first()
    return details


def get_company_finance(stock_code_list):
    """Scrape annual financial figures for each company and append them to CSV.

    For every row a new browser window is opened on the company's page, the
    annual balance sheet and income statement are captured, and six metrics
    for three reporting years are extracted.

    Args:
        stock_code_list: iterable of rows whose first three items are the
            stock code, the stock short name and the company name. Rows with
            fewer than three items or an empty code are reported and skipped.

    Side effects:
        * Appends one line per company to 'company_finance_details.csv' and
          prints the same data as a dict.
        * Appends (code, name, company name) to 'company_error.csv' when the
          captured pages cannot be parsed.
        * Prints "年报按钮获取失败" and stops the whole run when the statement
          buttons cannot be reached within MAX_RETRY_TIME attempts.

    The Firefox session is always shut down on exit, including on early return
    or an unexpected exception.
    """
    browser = webdriver.Firefox()
    try:
        for row in stock_code_list:
            if len(row) < 3 or not row[0]:
                print("跳过格式不正确的行: %r" % (row,))
                continue
            code, name, company_name = row[0], row[1], row[2]

            url = URL % _exchange_prefixed_code(code)
            # Pass the URL as a script argument rather than splicing it into
            # the JavaScript source.
            browser.execute_script("window.open(arguments[0])", url)

            pages = _capture_statement_pages(browser)
            if pages is None:
                print("年报按钮获取失败")
                return

            try:
                details = _extract_finance_details(
                    code, name, company_name, *pages)
            except Exception:
                # Parsing a fixed snapshot is deterministic, so retrying
                # cannot help; record the company and move on.
                _append_csv_row('company_error.csv',
                                [code, name, company_name])
            else:
                print(details)
                # The dict preserves insertion order, which is the CSV
                # column order.
                _append_csv_row('company_finance_details.csv',
                                list(details.values()))

            # Close this company's window and return to the original one.
            browser.close()
            browser.switch_to.window(browser.window_handles[0])
    finally:
        browser.quit()
# Script entry: read the company list and scrape every entry.
# Each CSV row is expected to carry stock code, short name and company name
# in its first three columns.
company_csv_path = 'company.csv'
# newline='' is what the csv module documents for files handed to csv.reader.
with open(company_csv_path, 'r', newline='') as company_file:
    # csv.reader yields [] for blank lines; drop them so they are never
    # indexed as if they were company rows.
    stock_code_list = [
        row for row in csv.reader(company_file, delimiter=',') if row
    ]
get_company_finance(stock_code_list)