目前网上有很多关于scrapy的文章,这里我主要介绍一下我在开发中遇到的问题及一些技巧:

1,以登录状态去爬取(带cookie)

 -安装内容:

    brew install phantomjs (MAC上)

    pip install selenium

 -代码:  

 from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Copy Selenium's stock PhantomJS capabilities into a fresh dict so the
# library-level defaults are not mutated by the override below.
dcap = dict(DesiredCapabilities.PHANTOMJS)

# PhantomJS lets request headers be overridden through capabilities; here the
# User-Agent is replaced with an Android mobile browser string.
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"
)
# Obtain login cookies from an account name and password.
def get_cookie_from_aicoin_login(account, password):
    """Log in to AICoin through PhantomJS and return the session cookies.

    The captcha is not solved automatically: each attempt saves a screenshot
    of the login page to ``aa.png`` and prompts on stdin for the code.

    Args:
        account: Value typed into the ``user_account`` field.
        password: Value typed into the ``user_password`` field.

    Returns:
        A JSON string mapping cookie names to values when the page title
        shows the home page after login; otherwise an empty dict.
    """
    # Imported locally: the snippet uses both modules but never imports them.
    import json
    import time

    browser = webdriver.PhantomJS(
        executable_path='/usr/local/bin/phantomjs',
        desired_capabilities=dcap,
    )
    try:
        browser.get("https://www.aicoin.net.cn/sign_in")
        # Keep retrying for as long as the browser is still on the sign-in page.
        while 'Sign in to AIcoin' in browser.title:
            username = browser.find_element_by_name("user_account")  # user name field
            username.clear()
            username.send_keys(account)

            psd = browser.find_element_by_name("user_password")  # password field
            psd.clear()
            psd.send_keys(password)

            code = browser.find_element_by_name("user_verify")  # captcha field
            code.clear()
            # Some pages show a broken captcha; clicking the button loads a
            # fresh one before the screenshot is taken.
            code_verify = browser.find_element_by_xpath("//button[@class='verify_code']")
            code_verify.click()
            time.sleep(1)
            browser.save_screenshot("aa.png")  # screenshot of the login page, saved locally
            # The operator reads the captcha from the image and types it in.
            code_txt = input("请查看路径下新生成的aa.png,然后输入验证码:")
            code.send_keys(code_txt)

            commit = browser.find_element_by_xpath("//div[@class='sure_btn']/button[@type='submit']")  # login button
            commit.click()
            time.sleep(3)  # give the post-login redirect time to finish

        cookie = {elem["name"]: elem["value"] for elem in browser.get_cookies()}

        # A successful login redirects to the home page, identified by its title.
        if 'AICoin - Leader Of Global Cryptocurrency Tickers Application' in browser.title:
            return json.dumps(cookie)
        return {}
    finally:
        # Always shut the PhantomJS process down, even if a lookup above raised.
        browser.quit()

特别提示:当需要爬取动态内容(js加载的内容)时,也会用到PHANTOMJS

运行爬虫(scrapy crawl yourspider)前,需要先cd到该爬虫的主目录,即包含scrapy.cfg的目录;另外,调试的时候可以直接使用scrapy shell yoururl进行代码测试。

2,递归爬取内容

-在scrapy对应的spider文件中添加如下代码(下面的代码用于爬取股吧的帖子和评论)

 from scrapy.http import Request
import scrapy
from gubaspider.items import PostItem, CommentItem


class GubaSpider(scrapy.spiders.Spider):
    """Crawl post listings on guba.eastmoney.com and follow each post.

    NOTE(review): ``get_node_value`` and ``parse_comment`` are referenced
    below but are not part of this excerpt; they must be defined on the class.
    ``get_cookie_from_aicoin_login``, ``user`` and ``pwd`` must likewise be
    provided by the surrounding module.
    """

    name = "guba"
    allowed_domains = ["eastmoney.com"]
    start_urls = [
        "http://guba.eastmoney.com/default_551215.html"
    ]

    def parse(self, response):
        """Read the post list and request every post page.

        The list-page fields are forwarded to ``parse_article`` through the
        request ``meta``; login cookies are attached to every request.
        """
        import json

        tmp_list = []
        for i in response.xpath('//ul[@class="newlist"]/li'):
            tmp_list.append({
                'title': i.xpath('span/a[2]/text()').extract()[0],
                'ar_url': i.xpath('span/a[2]/@href').extract()[0],
                'group': i.xpath('span/a[1]/text()').extract()[0],
                'comment_sum': i.xpath('cite[2]/text()').extract()[0],
                'read_sum': i.xpath('cite[1]/text()').extract()[0],
                'author': i.xpath('cite[3]/a/text()').extract()[0],
            })
        if not tmp_list:
            return

        # Log in once per listing page rather than once per request: the
        # login helper is interactive (captcha prompt) and slow.
        cookies = get_cookie_from_aicoin_login(user, pwd)
        # The helper returns a JSON string on success, while Request expects
        # a dict (or list of dicts), so decode it first.
        if isinstance(cookies, str):
            cookies = json.loads(cookies)

        for z in tmp_list:
            # Follow the URL found on the first page, carrying the list-page
            # fields and the cookies; ``callback`` handles the new response.
            yield Request(
                'http://guba.eastmoney.com' + z.pop('ar_url'),
                callback=self.parse_article,
                meta=z,
                cookies=cookies,
            )

    def parse_article(self, response):
        """Build a ``PostItem`` from a post page and request further comment pages."""
        title = response.meta['title']
        group = response.meta['group']
        comment_sum = response.meta['comment_sum']
        read_sum = response.meta['read_sum']
        author = response.meta['author']

        content = response.xpath('//div[@id="zwcontent"]/div[@class="zwcontentmain"]/div[@id="zwconbody"]/div[@class="stockcodec"]').extract()
        post_time = self.get_node_value(response.xpath('//div[@id="zwcontent"]/div[@id="zwcontt"]/div[@id="zwconttb"]/div[@class="zwfbtime"]/text()').extract())
        # ``get_node_value`` is compared against 0 here, so 0 is treated as
        # "no value"; in that case PostWay falls back to 0 as well.
        post_type = 0
        if post_time != 0:
            post_type = post_time.split(' ')[-1]
            post_time = post_time[4:24]
        good_sum = self.get_node_value(response.xpath('//div[@id="zwcontent"]/div[@class="zwconbtns clearfix"]/div[@id="zwconbtnsi_z"]/span[@id="zwpraise"]/a/span/text()').extract())
        transmit_sum = self.get_node_value(response.xpath('//div[@id="zwcontent"]/div[@class="zwconbtns clearfix"]/div[@id="zwconbtnsi_zf"]/a/span/text()').extract())

        # Shared prefix of the three per-comment XPath queries.
        cm_base = '//div[@id="zwlist"]/div[@class="zwli clearfix"]/div[@class="zwlitx"]/div[@class="zwlitxt"]'
        comments = response.xpath(cm_base + '/div[@class="zwlitext stockcodec"]/text()').extract()
        cm_name = response.xpath(cm_base + '/div[@class="zwlianame"]/span[@class="zwnick"]/a/text()').extract()
        # Named ``cm_time`` so the ``time`` module name is not shadowed.
        cm_time = response.xpath(cm_base + '/div[@class="zwlitime"]/text()').extract()
        page_info = response.xpath('//div[@id="zwlist"]/div[@class="pager talc zwpager"]/span[@id="newspage"]/@data-page').extract()

        item = PostItem()
        item['Author'] = author  # post author
        item['Title'] = title  # post title
        item['Content'] = content  # post body
        item['PubTime'] = post_time  # publication time
        item['PostWay'] = post_type  # how it was posted (web, etc.)
        item['Url'] = response.url  # post URL
        item['Group'] = group  # forum the post belongs to
        item['Like'] = good_sum  # number of likes
        item['Transmit'] = transmit_sum  # number of forwards
        item['Comment_Num'] = comment_sum  # number of comments
        item['Tour'] = read_sum  # number of views

        cm_list = []
        # zip() stops at the shortest list, so a mismatch between the three
        # queries no longer raises IndexError.
        for x, (nick, posted, text) in enumerate(zip(cm_name, cm_time, comments)):
            if text == ' ':
                # A blank text node: fall back to the titles of the images
                # inside the comment, joined with '|'.
                s = '//div[@id="zwlist"]/div[' + str(x + 1) + ']/div[@class="zwlitx"]/div[@class="zwlitxt"]/div[@class="zwlitext stockcodec"]/img/@title'
                comment = '|'.join(response.xpath(s).extract())
            else:
                comment = text
            cm_list.append({'name': nick, 'time': posted[4:], 'comment': comment})
        item['Comments'] = cm_list  # replies
        yield item  # handed to the item pipeline (stored in the DB)

        if len(page_info) > 0:
            page_info = page_info[0].split('|')
            # NOTE(review): fields 1 and 2 are presumably total count and page
            # size; if so, this requests one extra page when the total is an
            # exact multiple of the page size - confirm before changing.
            sumpage = int(page_info[1]) // int(page_info[2]) + 1
            for page in range(2, sumpage + 1):
                cm_url = 'http://guba.eastmoney.com/' + page_info[0] + str(page) + '.html'
                yield Request(cm_url, callback=self.parse_comment)  # crawl the next comment page

3,将数据存入mongodb

-pipelines文件中添加自定义的pipeline类:

 import pymongo

 class MongoPipeline(object):
     """Item pipeline that stores every scraped item in MongoDB.

     Each item is written to a collection named after the item's class.
     """

     def __init__(self, mongo_uri, mongo_db):
         """Remember the connection URI and database name; no connection yet."""
         self.mongo_uri = mongo_uri
         self.mongo_db = mongo_db

     @classmethod
     def from_crawler(cls, crawler):
         """Build the pipeline from the ``MONGO_URI`` / ``MONGO_DATABASE`` settings."""
         return cls(
             mongo_uri=crawler.settings.get('MONGO_URI'),
             mongo_db=crawler.settings.get('MONGO_DATABASE')
         )

     def open_spider(self, spider):
         """Open the MongoDB client and select the configured database."""
         self.client = pymongo.MongoClient(self.mongo_uri)
         self.db = self.client[self.mongo_db]

     def close_spider(self, spider):
         """Close the MongoDB client."""
         self.client.close()

     def process_item(self, item, spider):
         """Insert ``item`` into its collection and pass it on unchanged."""
         collection_name = item.__class__.__name__
         # Collection.insert() was deprecated in PyMongo 3 and removed in
         # PyMongo 4; insert_one() is the supported single-document call.
         self.db[collection_name].insert_one(dict(item))
         return item

-items中定义自己item:

 from scrapy import Item,Field

 class PostItem(Item):
     """A forum post together with its replies."""

     Author = Field()  # post author
     Title = Field()  # post title
     Content = Field()  # post body
     PubTime = Field()  # publication time
     # Top = Field()  # whether the post is pinned (disabled)
     PostWay = Field()  # how it was posted (web, etc.)
     Url = Field()  # post URL
     Group = Field()  # forum the post belongs to
     Like = Field()  # number of likes
     Transmit = Field()  # number of forwards
     Comment_Num = Field()  # number of comments
     Tour = Field()  # number of views
     Comments = Field()  # replies


 class CommentItem(Item):
     """A URL together with its comments."""

     Url = Field()  # url
     Comments = Field()  # comments

-settings中添加ITEM_PIPELINES

 # Enable the MongoDB pipeline; the integer is the pipeline's order value.
 ITEM_PIPELINES = {
     "gubaspider.pipelines.MongoPipeline": 300,
 }

4,添加代理和User-Agent

-在middlewares中添加你定义的中间件类:

 from user_agents import agents#从一个文件导入全部agent
import random


class UserAgentMiddleware(object):
    """Downloader middleware that randomises the User-Agent and sets a proxy."""

    def process_request(self, request, spider):
        """Stamp ``request`` with a random User-Agent and the proxy address.

        Returns ``None`` implicitly.
        """
        # Pick a different agent string from the imported list for each request.
        agent = random.choice(agents)
        request.headers["User-Agent"] = agent
        # Route the request through the proxy address.
        request.meta['proxy'] = "http://proxy.yourproxy:8001"

-在settings中进行中间件配置

 # Register the custom downloader middleware; the integer is its order value.
 DOWNLOADER_MIDDLEWARES = {
     "gubaspider.middlewares.UserAgentMiddleware": 543,
 }

-user_agents文件包含一个agent列表:


 """ User-Agents """
# Pool of User-Agent strings. Every entry is unique, so a uniform random
# choice over the list does not favour any one string.
agents = [
    "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)",
    "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a",
    "Mozilla/2.02E (Win95; U)",
    "Mozilla/3.01Gold (Win95; I)",
    "Mozilla/4.8 [en] (Windows NT 5.1; U)",
    "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)",
    "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3",
    "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
]

※ 以上部分代码参考https://github.com/LiuXingMing/SinaSpider


ITEM_PIPELINES

最新文章

  1. JS正则表达式验证账号、手机号、电话和邮箱
  2. Asp.net Mvc模块化开发之“开启模块开发、调试的简单愉快之旅”
  3. oracle查看当前用户权限
  4. 使用EasyUI的插件前需要引入的文件
  5. artTemplate 介绍
  6. XSLT模糊查询函数contains不区分大小写,for-each排序
  7. Linux操作杂记
  8. IIS上的错误与解决方案
  9. 设置单选的listView或者gridview
  10. Char型和string型字符串比较整理
  11. 动态Lambda进阶一
  12. 国外流行的共享网站实现:facebook,twitter,google+1,tumblr等待
  13. Django:之CMDB资源系统
  14. [BZOJ2820][Luogu2257]YY的GCD
  15. HTTP请求协议
  16. html-day06
  17. android--------内存泄露分析工具—Android Monitor
  18. Bootstrap如何关闭弹窗
  19. mysqlsandbox
  20. 利用PHP的debug_backtrace函数,实现PHP文件权限管理、动态加载

热门文章

  1. 蓝桥杯近3年初赛题之三(17年b组)
  2. sql多行多列重复
  3. 判断是否为JSON对象
  4. html5 旋转导航练习
  5. PIL模块
  6. IP地址 0.0.0.0 是什么意思?
  7. PAT (Basic Level) Practice (中文)1008 数组元素循环右移问题 (20 分)
  8. workbench使用小笔记(不定期持续更新)
  9. html5中视频播放问题总结
  10. Josephina and RPG HDU - 4800