生成爬虫

scrapy genspider 爬虫名 网址

打开调试用shell

scrapy shell 网址

主体 stock.py

# -*- coding: utf-8 -*-
import re
from urllib import parse
import scrapy
from stock_spider.items import StockItem


class StockSpider(scrapy.Spider):
    """Crawl board-member (executive) pages and yield one StockItem per page."""

    name = 'stock'
    # Bug fix: allowed_domains entries must be bare domain names; the original
    # trailing slash ('pycs.greedyai.com/') prevents the offsite middleware
    # from ever matching the domain.
    allowed_domains = ['pycs.greedyai.com']
    start_urls = ['http://pycs.greedyai.com/']

    def parse(self, response):
        """Follow every <a href> found on the index page."""
        post_urls = response.xpath("//a/@href").extract()
        for post_url in post_urls:
            # urljoin turns relative hrefs into absolute, crawlable URLs.
            yield scrapy.Request(url=parse.urljoin(response.url, post_url),
                                 callback=self.parse_detail,
                                 dont_filter=True)

    def parse_detail(self, response):
        """Extract names / stock code / positions from one detail page."""
        stock_item = StockItem()
        # Board members' names.
        stock_item['names'] = self.get_name(response)
        # stock_item['sexs'] = self.get_sex(response)  # disabled: some
        # profiles lack gender info, which made downstream lists go out
        # of bounds
        stock_item['codes'] = self.get_code(response)
        stock_item['positions'] = self.get_position(response)
        yield stock_item

    def get_name(self, response):
        """Return the list of member names from the roster table."""
        return response.xpath("//td[@class=\"tc name\"]/a/text()").extract()

    def get_sex(self, response):
        """Return the genders parsed out of each member's intro text."""
        sex_temp = response.xpath("//td[@class=\"intro\"]/text()").extract()
        sex_list = []
        for sex_info in sex_temp:
            try:
                sex_list.append(re.findall(r"男|女", sex_info)[0])
            except IndexError:
                # Cells that are only escape/filler characters carry no
                # gender info; skip them and keep reading.
                continue
        return sex_list

    def get_code(self, response):
        """Return the digits of the stock code, or [] when none is found.

        Bug fix: the original returned only the last iteration's findall
        result and raised NameError when the xpath matched nothing.
        """
        code_temp = response.xpath(
            "/html/body/div[3]/div[1]/div[2]/div[1]/h1/a/@title").extract()
        for code_info in code_temp:
            code = re.findall(r"\d+", code_info)
            if code:
                return code
        return []

    def get_position(self, response):
        """Return the list of member positions/titles."""
        return response.xpath("//td[@class=\"tl\"]/text()").extract()

main.py

from scrapy.cmdline import execute  # debug helper: launch the crawl from an IDE

import os
import sys

# Make the project directory importable so the crawl can resolve the package.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# Equivalent to running "scrapy crawl stock" on the command line.
execute(["scrapy", "crawl", "stock"])

items.py

# -*- coding: utf-8 -*-

import scrapy

class StockSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class StockItem(scrapy.Item):  # added for the executive crawl
    # Board members' names scraped from the roster table.
    names = scrapy.Field()
    # sexs = scrapy.Field()  # disabled: some profiles lack gender info
    # Stock code digits for the company page.
    codes = scrapy.Field()
    # Members' positions/titles.
    positions = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os


class StockSpiderPipeline(object):
    """Scrapy's generated default pipeline: passes every item through."""

    def process_item(self, item, spider):
        return item


class StockPipeline(object):  # added: appends executive records to a CSV
    """Append scraped executive records to executive_prep.csv."""

    def __init__(self):
        # a+: append when the file exists, create it otherwise.
        # encoding pinned to utf-8 so the Chinese header/rows are portable
        # (the original relied on the platform default encoding).
        self.file = open("executive_prep.csv", "a+", encoding="utf-8")

    def process_item(self, item, spider):
        """Write the header once (empty file), then append this item's rows.

        Bug fix: the original inverted the size check — on an empty file it
        wrote only the header and silently dropped the first item's data.
        """
        if os.path.getsize("executive_prep.csv") == 0:
            self.file.write("姓名,性别,股票代码,职位\n")
        self.write_content(item)
        self.file.flush()
        # Return the item so any later pipeline still receives it.
        return item

    def write_content(self, item):
        """Write one CSV row per board member carried by *item*."""
        names = item['names']
        # sexs = item['sexs']  # disabled upstream: some profiles lack gender
        codes = item['codes']
        positions = item['positions']
        # Guard against a page where no stock code was found (empty list).
        code = codes[0] if codes else ""
        # zip truncates to the shorter list instead of raising IndexError
        # when names and positions disagree in length.
        for name, position in zip(names, positions):
            # The empty gender column keeps every row aligned with the
            # 4-column header (the original wrote only 3 fields per row).
            self.file.write(name + "," + "," + code + "," + position + "\n")

    def close_spider(self, spider):
        """Called by Scrapy when the spider finishes; release the file."""
        self.file.close()

settings.py

# -*- coding: utf-8 -*-

BOT_NAME = 'stock_spider'

SPIDER_MODULES = ['stock_spider.spiders']
NEWSPIDER_MODULE = 'stock_spider.spiders'

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Lower number = runs earlier.  The two pipelines originally shared
# priority 300, leaving their relative execution order undefined; the
# CSV writer now gets a distinct, later slot.
ITEM_PIPELINES = {
    'stock_spider.pipelines.StockSpiderPipeline': 300,
    'stock_spider.pipelines.StockPipeline': 400,  # added: CSV writer
}

最新文章

  1. ABAP程序互调用:SUBMIT、CALL TRANSACTION、LEAVE TO TRANSACTION
  2. 好的bootstrap文章
  3. Codevs 1229 数字游戏
  4. django"动态网页","动态url","调试方法"
  5. Jquery Mobile 动态添加元素然后刷新 转
  6. 浅谈iOS中的单例模式
  7. 知道创宇研发技能表v2.2
  8. [你必须知道的.NET]第三十二回,,深入.NET 4.0之,Tuple一二
  9. oracle收集
  10. c#保存datagridview中的数据时报错 “动态SQL生成失败。找不到关键信息”
  11. KISSY对vm的抽取
  12. Log4j运用于代码中
  13. Scrapy框架-Spider和CrawlSpider的区别
  14. CentOS7.3安装VirtualBox
  15. 信用评分卡 (part 1 of 7)
  16. ATPCS规则
  17. poj 1236(强连通分量分解模板题)
  18. 嵌入式开发之hi3519---GPIO 驱动
  19. Linux命令学习之xargs命令
  20. NOIP模拟赛16

热门文章

  1. nginx 端口转发 (proxy_pass反向代理)
  2. P1129 [ZJOI2007]矩阵游戏 二分图匹配
  3. Burpsuite 2.0.11 Beta 破解版下载
  4. 微信小程序开发入门教程(四)---自己动手做个小程序
  5. Tomcat怎么关闭日志输出
  6. Unicode 和utf-8的转换以及深拷贝和浅拷贝的区别
  7. Python3 Address already in use 解决方法
  8. CF427D
  9. .net 数据导出
  10. 以下示例使用一个 x,y 坐标列表创建了一个多边形几何对象。然后使用裁剪工具来裁剪具有多边形几何对象的要素类。