Final version: 07_中证网(Plus-Pro).py

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys
import os

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # change the default stdout encoding

for qq in range(8):
    # query = input("【中证网】请输入你想搜索的内容:")
    query = '苏州银行'
    # years to crawl
    year = [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
    # total result pages for each year
    pages = [2, 1, 1, 1, 11, 1, 19, 7]
    year = year[qq]
    pages = pages[qq]
    if not os.path.isdir(f'D:/桌面/爬虫-银行/中国证券网/{query}'):  # if the folder does not exist
        os.mkdir(f'D:/桌面/爬虫-银行/中国证券网/{query}')  # create it
    m = 0
    for p in range(1, pages + 1):
        url = f'http://search.cs.com.cn/search?page={p}&channelid=215308&searchword={query}&keyword={query}&token=12.1462412070719.47&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=&timescope=&timescopecolumn=&orderby=&timeline=={year}'
        dic = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
        resp = requests.get(url, headers=dic)
        resp.encoding = 'utf-8'
        # print(resp)
        print(f'\n>>>--------------------第{p}页---------------------<<<\n')
        print(f'\n>>>--------------------第{p}页---------------------<<<\n')
        print(f'\n>>>--------------------第{p}页---------------------<<<\n')
        # print(resp.text)
        page = BeautifulSoup(resp.text, "html.parser")  # specify the HTML parser
        alist = page.find_all("table")
        datalist = []
        for ii in alist:
            ss = ii.find('td', style='font-size: 12px;line-height: 24px;color: #333333;margin-top: 4px;')
            # print('ss=\n\n',ss)
            if ss is not None:
                ss = ss.get_text()
                datalist.append(ss)
        # print('data:',datalist,len(datalist))
        if not os.path.isdir(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}'):  # if the year folder does not exist
            os.mkdir(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}')  # create it
        for ii in range(len(datalist)):
            fp = open(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}/({year}){ii + m + 1}.txt', 'w+', encoding='utf-8')
            fp.write(datalist[ii] + '\n')  # text only
            print(datalist[ii])
            print(f'\n> > >{year}年,第{p}页,第{ii + 1}篇,成功! < < <')
            fp.close()
        m = m + len(datalist) + 1
    print('----------------------------')
    print(f'------\n{year}年,爬取完毕----')
    print('----------------------------')
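The final version above keeps two parallel lists (year/pages), repeats the isdir/mkdir check, and advances the file counter m by hand. Below is a minimal sketch of the same per-year download loop folded into one helper; scrape_year is a hypothetical name, os.makedirs(..., exist_ok=True) replaces the two mkdir checks, and whether the server still accepts the shortened query string (only the parameters that actually vary above) is an assumption, not something the original verifies.

# Sketch only: the same per-year scrape as 07_中证网(Plus-Pro).py, reorganized into a function.
import os
import requests
from bs4 import BeautifulSoup

BASE = 'D:/桌面/爬虫-银行/中国证券网'
UA = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
STYLE = 'font-size: 12px;line-height: 24px;color: #333333;margin-top: 4px;'

def scrape_year(query, year, pages):
    """Save the search-result summaries for one year under BASE/query/year/."""
    out_dir = f'{BASE}/{query}/{year}'
    os.makedirs(out_dir, exist_ok=True)      # replaces the isdir()/mkdir() pairs
    saved = 0
    for p in range(1, pages + 1):
        url = (f'http://search.cs.com.cn/search?page={p}&channelid=215308'
               f'&searchword={query}&keyword={query}&perpage=10&timeline=={year}')
        resp = requests.get(url, headers=UA, timeout=10)
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, 'html.parser')
        for table in soup.find_all('table'):
            td = table.find('td', style=STYLE)
            if td is None:
                continue
            saved += 1
            with open(f'{out_dir}/({year}){saved}.txt', 'w', encoding='utf-8') as fp:
                fp.write(td.get_text() + '\n')
        print(f'{year}年 第{p}页 done, {saved} files so far')
    return saved

if __name__ == '__main__':
    query = '苏州银行'
    for year, pages in zip([2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021],
                           [2, 1, 1, 1, 11, 1, 19, 7]):
        scrape_year(query, year, pages)

Keeping the per-year work in one function also leaves an obvious place to add a time.sleep between requests if the site starts throttling rapid queries.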

Optimization history: 01_中证网.py

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # change the default stdout encoding

query = input("【中证网】请输入你想搜索的内容:")
pages = int(input("要爬取的页数(不小于1):"))
if pages < 1:
    exit()

url = f'http://search.cs.com.cn/search?channelid=215308&perpage=&templet=&token=12.1462412070719.47&searchword={query}'
dic = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
                  "Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
resp = requests.get(url, headers=dic)
resp.encoding = 'utf-8'
# print(resp)
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser")  # specify the HTML parser
alist = page.find("table").find_all("a")
# print(alist)
weblist = []
for a in alist:
    if a.get('href')[:5] == "https":
        weblist.append(a.get('href'))
# ---------------- each article on page 1 ----------------
m = 0
for ii in range(len(weblist)):
    url_a = weblist[ii]
    # print('0=',url_a)
    dic_a = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
                      "Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
    resp_a = requests.get(url_a, headers=dic_a)
    resp_a.encoding = 'gbk'
    # print('New:\n',resp_a.text)
    page_a = BeautifulSoup(resp_a.text, "html.parser")  # specify the HTML parser
    # print('123:\n',page_a)
    page_b = page_a.find('section').find_all('p')
    # print(page_b)
    fp = open(f'D:/桌面/爬虫-银行/中国证券网/中国银行/{ii+1}.txt', 'w+', encoding='utf-8')
    txt_list = []
    for txt_a in page_b:
        # print(txt_a.text)
        txt_list.append(txt_a.text)
    # ++++++++++++++++++++++ write the text ++++++++++++++++++++++
    for i in range(len(txt_list)):
        fp.write(txt_list[i] + '\n')  # text only
    fp.close()
    print(f'>>{ii+1}成功!')
    m = ii + 1
# ---------------- remaining pages ----------------
if pages > 1:
    for p in range(pages):
        url_s = f"http://search.cs.com.cn/search?page={p+1}&channelid=215308&searchword={query}"
        resp = requests.get(url, headers=dic)  # note: still fetches `url`, so url_s is never used
        resp.encoding = 'utf-8'
        page = BeautifulSoup(resp.text, "html.parser")  # specify the HTML parser
        alist = page.find("table").find_all("a")
        weblist = []
        for a in alist:
            if a.get('href')[:5] == "https":
                weblist.append(a.get('href'))
        # ---------------- each article on this page ----------------
        for ii in range(len(weblist)):
            url_a = weblist[ii]
            dic_a = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
                              "Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
            resp_a = requests.get(url_a, headers=dic_a)
            resp_a.encoding = 'gbk'
            page_a = BeautifulSoup(resp_a.text, "html.parser")  # specify the HTML parser
            page_b = page_a.find('section').find_all('p')
            fp = open(f'D:/桌面/爬虫-银行/中国证券网/中国银行/{ii + 1 + m}.txt', 'w+', encoding='utf-8')
            txt_list = []
            for txt_a in page_b:
                txt_list.append(txt_a.text)
            # ++++++++++++++++++++++ write the text ++++++++++++++++++++++
            for i in range(len(txt_list)):
                fp.write(txt_list[i] + '\n')  # text only
            print(f'>>{ii + 1 + m}成功!')
            m = m + ii + 1
            fp.close()
print('---------------\n>>>爬取完毕<<<')
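The first-page block and the pages > 1 block in 01_中证网.py are copies of each other, and the multi-page branch builds url_s but then requests url again, so it keeps re-downloading page 1. One way to remove the duplication is to put the per-page and per-article work into helpers and loop once over all pages. This is only a sketch under the same assumptions as the original (article links start with https, the body sits in <section><p>); article_links and save_article are hypothetical names.

# Sketch only: one code path for every result page instead of two copied blocks.
import requests
from bs4 import BeautifulSoup

UA = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"}

def article_links(query, page_no):
    """Return the https article links on one search-result page."""
    url = f'http://search.cs.com.cn/search?page={page_no}&channelid=215308&searchword={query}'
    resp = requests.get(url, headers=UA, timeout=10)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'html.parser')
    return [a.get('href') for a in soup.find("table").find_all("a")
            if a.get('href', '').startswith('https')]

def save_article(url, path):
    """Fetch one article and write its <section> paragraphs to path."""
    resp = requests.get(url, headers=UA, timeout=10)
    resp.encoding = 'gbk'
    soup = BeautifulSoup(resp.text, 'html.parser')
    with open(path, 'w', encoding='utf-8') as fp:
        for p in soup.find('section').find_all('p'):
            fp.write(p.text + '\n')

if __name__ == '__main__':
    query = input("【中证网】请输入你想搜索的内容:")
    pages = int(input("要爬取的页数(不小于1):"))
    n = 0
    for page_no in range(1, pages + 1):
        for link in article_links(query, page_no):
            n += 1
            save_article(link, f'D:/桌面/爬虫-银行/中国证券网/中国银行/{n}.txt')
            print(f'>>{n}成功!')

The later versions in this post converge on roughly this shape, with a single page loop driving everything.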

Optimization history: 02_中证网.py

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # change the default stdout encoding

query = input("【中证网】请输入你想搜索的内容:")
pages = int(input("要爬取的页数(不小于1):"))
if pages < 1:
    exit()

url = f'http://search.cs.com.cn/search?page=1&channelid=215308&searchword={query}'
dic = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
                  "Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
resp = requests.get(url, headers=dic)
resp.encoding = 'utf-8'
# print(resp)
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser")  # specify the HTML parser
alist = page.find("table").find_all("a")
# print(alist)
weblist = []
for a in alist:
    if a.get('href')[:5] == "https":
        weblist.append(a.get('href'))
# ---------------- each article on page 1 ----------------
m = 0
for ii in range(len(weblist)):
    url_a = weblist[ii]
    # print('0=',url_a)
    dic_a = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
                      "Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
    resp_a = requests.get(url_a, headers=dic_a)
    resp_a.encoding = 'gbk'
    # print('New:\n',resp_a.text)
    page_a = BeautifulSoup(resp_a.text, "html.parser")  # specify the HTML parser
    # print('123:\n',page_a)
    page_b = page_a.find('section').find_all('p')
    # print(page_b)
    fp = open(f'D:/桌面/爬虫-银行/中国证券网/中国银行/0/(2021){ii+1}.txt', 'w+', encoding='utf-8')
    txt_list = []
    for txt_a in page_b:
        # print(txt_a.text)
        txt_list.append(txt_a.text)
    # ++++++++++++++++++++++ write the text ++++++++++++++++++++++
    for i in range(len(txt_list)):
        fp.write(txt_list[i] + '\n')  # text only
    fp.close()
    print(f'>>{ii+1}成功!')
    m = ii + 1
# ---------------- remaining pages ----------------
if pages > 1:
    for p in range(pages):
        url_s = f"http://search.cs.com.cn/search?page={p+1}&channelid=215308&searchword={query}"
        resp = requests.get(url, headers=dic)  # note: still fetches `url`, so url_s is never used
        resp.encoding = 'utf-8'
        page = BeautifulSoup(resp.text, "html.parser")  # specify the HTML parser
        alist = page.find("table").find_all("a")
        weblist = []
        for a in alist:
            if a.get('href')[:5] == "https":
                weblist.append(a.get('href'))
        # ---------------- each article on this page ----------------
        for ii in range(len(weblist)):
            url_a = weblist[ii]
            dic_a = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
                              "Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
            resp_a = requests.get(url_a, headers=dic_a)
            resp_a.encoding = 'gbk'
            page_a = BeautifulSoup(resp_a.text, "html.parser")  # specify the HTML parser
            page_b = page_a.find('section').find_all('p')
            fp = open(f'D:/桌面/爬虫-银行/中国证券网/中国银行/0/(2021){ii + 1 + m}.txt', 'w+', encoding='utf-8')
            txt_list = []
            for txt_a in page_b:
                txt_list.append(txt_a.text)
            # ++++++++++++++++++++++ write the text ++++++++++++++++++++++
            for i in range(len(txt_list)):
                fp.write(txt_list[i] + '\n')  # text only
            print(f'>>{ii + 1 + m}成功!')
            m = m + ii + 1
            fp.close()
print('---------------\n>>>爬取完毕<<<')

Optimization history: 03_中证网.py

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # change the default stdout encoding

query = input("【中证网】请输入你想搜索的内容:")
pages = int(input("要爬取的页数(不小于1):"))
if pages < 1:
    exit()

m = 0
for p in range(1, pages + 1):
    url = f'http://search.cs.com.cn/search?page={p}&channelid=215308&searchword={query}&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=&timescope=&timescopecolumn=&orderby=&timeline==2021'
    dic = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
    resp = requests.get(url, headers=dic)
    resp.encoding = 'utf-8'
    # print(resp)
    print(f'\n>>>--------------------第{p}页---------------------<<<\n')
    print(f'\n>>>--------------------第{p}页---------------------<<<\n')
    print(f'\n>>>--------------------第{p}页---------------------<<<\n')
    # print(resp.text)
    page = BeautifulSoup(resp.text, "html.parser")  # specify the HTML parser
    alist = page.find("table").find_all('a')
    weblist = []
    for a in alist:
        if a.get('href')[:5] == "https":
            weblist.append(a.get('href'))
    # print('weblist==',weblist)
    # ---------------- each article on this page ----------------
    for ii in range(len(weblist)):
        url_a = weblist[ii]
        # print('0=',url_a)
        dic_a = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
        resp_a = requests.get(url_a, headers=dic_a)
        resp_a.encoding = 'gbk'
        # print('New:\n',resp_a.text)
        page_a = BeautifulSoup(resp_a.text, "html.parser")  # specify the HTML parser
        # print('123:\n',page_a)
        page_b = page_a.find('section').find_all('p')
        # print(page_b)
        fp = open(f'D:/桌面/爬虫-银行/中国证券网/中国银行/2021/(2021){ii+m+1}.txt', 'w+', encoding='utf-8')
        txt_list = []
        for txt_a in page_b:
            # print('txt_a===',txt_a.text)
            txt_list.append(txt_a.text)
        print(f'\n-++++++++++++++++++第{ii+1}篇文章++++++++++++++++-\n', txt_list, len(txt_list))
        # ++++++++++++++++++++++ write the text ++++++++++++++++++++++
        for i in range(len(txt_list)):
            fp.write(txt_list[i] + '\n')  # text only
        # print('-----------------------------------')
        print(f'\n> > >{ii+1}成功! < < <')
        fp.close()
    m = m + len(weblist) + 1
print('---------------\n>>>爬取完毕<<<')
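The search URL in 03_中证网.py is one long hand-written string with several empty fields (&&andsen=&total=...), which is easy to mistype. requests can assemble the query string from a params dict instead. A minimal sketch follows, with the caveat that the server accepting the dropped empty parameters and the percent-encoded timeline value is an assumption:

# Sketch only: let requests assemble the search URL instead of hand-writing it.
import requests

UA = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}

def search_html(query, page, year):
    params = {
        'page': page,
        'channelid': 215308,
        'searchword': query,
        'perpage': 10,
        'outlinepage': 5,
        'timeline': f'={year}',   # the original URL carries timeline=={year}; requests encodes the inner '=' as %3D
    }
    resp = requests.get('http://search.cs.com.cn/search', params=params,
                        headers=UA, timeout=10)
    resp.encoding = 'utf-8'
    return resp.text

print(search_html('中国银行', page=1, year=2021)[:300])   # quick smoke test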

Optimization history: 04_中证网(网址筛选问题).py (the URL-filtering-problem version)

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # change the default stdout encoding

query = input("【中证网】请输入你想搜索的内容:")
pages = int(input("要爬取的页数(不小于1):"))
if pages < 1:
    exit()

m = 0
for p in range(1, pages + 1):
    # note: the URL interpolates {pages} rather than {p}, so every iteration requests the same page
    url = f'http://search.cs.com.cn/search?page={pages}&channelid=215308&searchword={query}&keyword={query}&token=12.1462412070719.47&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=&timescope=&timescopecolumn=&orderby=&timeline==2020'
    dic = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
    resp = requests.get(url, headers=dic)
    resp.encoding = 'utf-8'
    # print(resp)
    print(f'\n>>>--------------------第{p}页---------------------<<<\n')
    print(f'\n>>>--------------------第{p}页---------------------<<<\n')
    print(f'\n>>>--------------------第{p}页---------------------<<<\n')
    # print(resp.text)
    page = BeautifulSoup(resp.text, "html.parser")  # specify the HTML parser
    alist = page.find("table").find_all('a')
    print('alist:', alist)
    weblist = []
    for a in alist:
        if a.get('href')[4:] == "http":  # the filtering bug: this compares the tail of the URL, not its prefix
            weblist.append(a.get('href'))
    print('weblist==', weblist)
    # ---------------- each article on this page ----------------
    for ii in range(len(weblist)):
        url_a = weblist[ii]
        # print('0=',url_a)
        dic_a = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
        resp_a = requests.get(url_a, headers=dic_a)
        resp_a.encoding = 'gbk'
        # print('New:\n',resp_a.text)
        page_a = BeautifulSoup(resp_a.text, "html.parser")  # specify the HTML parser
        # print('123:\n',page_a)
        page_b = page_a.find('section').find_all('p')
        # print(page_b)
        fp = open(f'D:/桌面/爬虫-银行/中国证券网/中国银行/2020/(2020){ii+m+1}.txt', 'w+', encoding='utf-8')
        txt_list = []
        for txt_a in page_b:
            # print('txt_a===',txt_a.text)
            txt_list.append(txt_a.text)
        print(f'\n-++++++++++++++++++第{ii+1}篇文章++++++++++++++++-\n', txt_list, len(txt_list))
        # ++++++++++++++++++++++ write the text ++++++++++++++++++++++
        for i in range(len(txt_list)):
            fp.write(txt_list[i] + '\n')  # text only
        # print('-----------------------------------')
        print(f'\n> > >{ii+1}成功! < < <')
        fp.close()
    m = m + len(weblist) + 1
print('---------------\n>>>爬取完毕<<<')
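The heading of this version names the problem: a.get('href')[4:] == "http" compares everything after the first four characters of the link against "http", which is essentially never true, so weblist stays empty and no article is fetched. The intended test is on the prefix, either with the [:4] slice that 05_中证网.py switches to or, more readably, with str.startswith. A small self-contained comparison (the URLs below are made up for illustration):

# Sketch only: the buggy slice versus the intended prefix test.
links = [
    'https://www.cs.com.cn/xwzx/hg/202001/t20200101_6000000.html',  # made-up example URL
    'javascript:void(0)',
    'http://www.cs.com.cn/sylm/jsbd/202001/t20200102_6000001.html',  # made-up example URL
]

buggy  = [u for u in links if u[4:] == 'http']          # always empty: compares the tail, not the head
sliced = [u for u in links if u[:4] == 'http']          # what 05_中证网.py switches to; matches http and https
prefix = [u for u in links if u.startswith(('http://', 'https://'))]  # clearest form

print(buggy)    # []
print(sliced)   # both article links
print(prefix)   # both article links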

Optimization history: 05_中证网.py

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # change the default stdout encoding

query = input("【中证网】请输入你想搜索的内容:")
year = int(input('要爬取的年份:'))
pages = int(input("要爬取的页数(不小于1):"))
if pages < 1:
    exit()

m = 0
for p in range(1, pages + 1):
    url = f'http://search.cs.com.cn/search?page={p}&channelid=215308&searchword={query}&keyword={query}&token=12.1462412070719.47&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=&timescope=&timescopecolumn=&orderby=&timeline=={year}'
    dic = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
    resp = requests.get(url, headers=dic)
    resp.encoding = 'utf-8'
    # print(resp)
    print(f'\n>>>--------------------第{p}页---------------------<<<\n')
    print(f'\n>>>--------------------第{p}页---------------------<<<\n')
    print(f'\n>>>--------------------第{p}页---------------------<<<\n')
    # print(resp.text)
    page = BeautifulSoup(resp.text, "html.parser")  # specify the HTML parser
    alist = page.find("table").find('tr').find_all('a')
    # print('alist:', alist)
    weblist = []
    for a in alist:
        if a.get('href')[:4] == "http":  # prefix test fixed relative to version 04
            weblist.append(a.get('href'))
    print('weblist==', weblist)
    # ---------------- each article on this page ----------------
    for ii in range(len(weblist)):
        url_a = weblist[ii]
        # print('0=',url_a)
        dic_a = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
        resp_a = requests.get(url_a, headers=dic_a)
        resp_a.encoding = 'gbk'
        # print('New:\n',resp_a.text)
        page_a = BeautifulSoup(resp_a.text, "html.parser")  # specify the HTML parser
        # print('123:\n',page_a)
        page_b = page_a.find_all('p')
        # print(page_b)
        fp = open(f'D:/桌面/爬虫-银行/中国证券网/中国银行/{year}/({year}){ii + m + 1}.txt', 'w+', encoding='utf-8')
        txt_list = []
        for txt_a in page_b:
            # print('txt_a===',txt_a.text)
            txt_list.append(txt_a.text)
        print(f'\n-++++++++++++++++++第{ii + 1}篇文章++++++++++++++++-\n', txt_list, len(txt_list))
        # ++++++++++++++++++++++ write the text ++++++++++++++++++++++
        for i in range(len(txt_list)):
            fp.write(txt_list[i] + '\n')  # text only
        # print('-----------------------------------')
        print(f'\n> > >{ii + 1}成功! < < <')
        fp.close()
    m = m + len(weblist) + 1
print('---------------\n>>>爬取完毕<<<')

Optimization history: 06_中证网(Plus).py

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys
import os

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # change the default stdout encoding

# query = input("【中证网】请输入你想搜索的内容:")
query = '交通银行'
year = int(input('要爬取的年份:'))
pages = int(input("要爬取的页数(不小于1):"))
if pages < 1:
    exit()

m = 0
for p in range(1, pages + 1):
    url = f'http://search.cs.com.cn/search?page={p}&channelid=215308&searchword={query}&keyword={query}&token=12.1462412070719.47&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=&timescope=&timescopecolumn=&orderby=&timeline=={year}'
    dic = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
    resp = requests.get(url, headers=dic)
    resp.encoding = 'utf-8'
    # print(resp)
    print(f'\n>>>--------------------第{p}页---------------------<<<\n')
    print(f'\n>>>--------------------第{p}页---------------------<<<\n')
    print(f'\n>>>--------------------第{p}页---------------------<<<\n')
    # print(resp.text)
    page = BeautifulSoup(resp.text, "html.parser")  # specify the HTML parser
    alist = page.find_all("table")
    datalist = []
    for ii in alist:
        ss = ii.find('td', style='font-size: 12px;line-height: 24px;color: #333333;margin-top: 4px;')
        # print('ss=\n\n',ss)
        if ss is not None:
            ss = ss.get_text()
            datalist.append(ss)
    # print('data:',datalist,len(datalist))
    if not os.path.isdir(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}'):  # if the year folder does not exist
        os.mkdir(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}')  # create it
    for ii in range(len(datalist)):
        fp = open(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}/({year}){ii + m + 1}.txt', 'w+', encoding='utf-8')
        fp.write(datalist[ii] + '\n')  # text only
        print(datalist[ii])
        print(f'\n> > >第{p}页,第{ii + 1}篇,成功! < < <')
        fp.close()
    m = m + len(datalist) + 1
print('----------------------------')
print(f'------\n{year}年,爬取完毕----')
print('----------------------------')
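06_中证网(Plus) stops visiting each article and pulls the summary text straight from the search-result table, which is what makes it much faster; the fragile part is matching the <td> by its exact inline style string. BeautifulSoup also accepts a function as an attribute filter, so the match can key on one stable property instead of the whole string. A minimal sketch against a made-up snippet, assuming 'line-height: 24px' stays a reliable marker of the summary cell:

# Sketch only: match the summary <td> on a fragment of its style instead of the exact string.
from bs4 import BeautifulSoup

html = """
<table><tr>
  <td style="font-size:12px; line-height: 24px; color:#333333">示例摘要文本</td>
  <td style="font-size:12px">别的单元格</td>
</tr></table>
"""  # made-up snippet standing in for one search-result table

soup = BeautifulSoup(html, 'html.parser')
td = soup.find('td', style=lambda s: s and 'line-height: 24px' in s)
if td is not None:
    print(td.get_text(strip=True))   # -> 示例摘要文本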
