Pipeline Usage

Storing to MongoDB

Code in pipelines.py

import pymongo


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # The target collection name is taken from the item's `collection` attribute.
        name = item.collection
        self.db[name].insert_one(dict(item))  # insert() was removed in pymongo 4; use insert_one()
        return item

    def close_spider(self, spider):
        self.client.close()
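
The pipeline reads item.collection to decide which collection to write to, so the Item class has to define that attribute. A minimal sketch of such an item (the field names are assumptions for illustration, not prescribed by the pipeline):

import scrapy


class ImageItem(scrapy.Item):
    # Used by MongoPipeline (collection name) and MysqlPipeline (table name) below.
    collection = table = 'images'
    id = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    thumb = scrapy.Field()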

settings configuration

MONGO_URI = 'localhost'  # change this if MongoDB runs on a remote host
MONGO_DB = 'images360'
ITEM_PIPELINES = {
    'images360.pipelines.MongoPipeline': 301,
}

Storing to MySQL

pipelines.py configuration

import pymysql


class MysqlPipeline():
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        # Recent pymysql versions require keyword arguments for connect().
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  database=self.database, charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        print(item['title'])
        data = dict(item)
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        # The SQL is built dynamically from the item's keys; the target table
        # (named by item.table) must already exist in MySQL with matching columns.
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item
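
Because the INSERT statement is built from the item's keys at runtime, the table named by item.table must already exist with matching columns. A rough sketch of a one-off setup script with pymysql, assuming the ImageItem fields from the sketch above (adjust names and column types to your own item):

import pymysql

# Host/user/password mirror the settings below and are assumptions.
db = pymysql.connect(host='localhost', user='root', password='root', port=3306)
cursor = db.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS images360 DEFAULT CHARACTER SET utf8')
db.select_db('images360')
cursor.execute(
    'CREATE TABLE IF NOT EXISTS images ('
    'id VARCHAR(255) NOT NULL PRIMARY KEY, '
    'url VARCHAR(255), title VARCHAR(255), thumb VARCHAR(255))'
)
db.close()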

settings configuration

MYSQL_HOST = 'localhost'  # change this if MySQL runs on a remote host
MYSQL_DATABASE = 'images360'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'
MYSQL_PORT = 3306
ITEM_PIPELINES = {
    'images360.pipelines.MysqlPipeline': 302,
}

Image Storage

pipelines.py

from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # Name the saved file after the last segment of the image URL.
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item

    def get_media_requests(self, item, info):
        yield Request(item['url'])

settings configuration

MYSQL_HOST = 'localhost'  # change this if MySQL runs on a remote host
MYSQL_DATABASE = 'images360'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'
MYSQL_PORT = 3306
ITEM_PIPELINES = {
    'images360.pipelines.ImagePipeline': 300,
}
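
ImagesPipeline is only enabled when IMAGES_STORE points at a directory for the downloaded files, so add it alongside the settings above (the path here is just an example):

IMAGES_STORE = './images'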

Middleware Usage

Random User-Agent headers

import random


class RandomUserAgentMiddleware():
    def __init__(self):
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
            'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2',
            'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1',
            'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
            'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/60.0',
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        ]

    def process_request(self, request, spider):
        # Pick a random User-Agent for every outgoing request.
        request.headers['User-Agent'] = random.choice(self.user_agents)

settings configuration

DOWNLOADER_MIDDLEWARES = {
    'fangtianxia.middlewares.RandomUserAgentMiddleware': 545,
}
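
Scrapy also ships a built-in UserAgentMiddleware (priority 500) that sets the header from the USER_AGENT setting; because the custom middleware runs later (545), it overwrites that value anyway, but the built-in one can be disabled explicitly if you prefer a single source of User-Agents. A sketch, not required by the original config:

DOWNLOADER_MIDDLEWARES = {
    'fangtianxia.middlewares.RandomUserAgentMiddleware': 545,
    # Optional: turn off the built-in middleware so the header is set in one place only.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}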

Setting up a random proxy pool

import logging
import requests


class ProxyMiddleware():
    def __init__(self, proxy_url):
        self.logger = logging.getLogger(__name__)
        self.proxy_url = proxy_url

    def get_random_proxy(self):
        # Fetch one proxy (host:port) from the proxy pool service.
        try:
            response = requests.get(self.proxy_url)
            if response.status_code == 200:
                proxy = response.text
                return proxy
        except requests.ConnectionError:
            return False

    def process_request(self, request, spider):
        # Only switch to a proxy once the request has already been retried.
        if request.meta.get('retry_times'):
            proxy = self.get_random_proxy()
            if proxy:
                uri = 'https://{proxy}'.format(proxy=proxy)
                self.logger.debug('Using proxy ' + uri)
                request.meta['proxy'] = uri

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            proxy_url=settings.get('PROXY_URL')
        )
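
Note that the proxy is only attached when request.meta already carries retry_times, i.e. after the first failed attempt. If you want the proxy to get more chances, RETRY_TIMES can be raised in settings (the value below is only an example):

RETRY_TIMES = 5  # example value; Scrapy's default is 2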

Downloader middleware configuration

DOWNLOADER_MIDDLEWARES = {
    'fangtianxia.middlewares.ProxyMiddleware': 543,
}

Proxy pool configuration (at the end of settings)

PROXY_URL = 'http://localhost:5555/random'  # change this if the proxy pool runs on a remote server
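
The middleware expects the URL above to return a bare host:port string in the response body. A quick manual check of the pool, using the local address from the snippet above:

import requests

# Should print something like '123.45.67.89:8888' if the proxy pool is running.
proxy = requests.get('http://localhost:5555/random').text
print(proxy)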

Integrating Selenium

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http import HtmlResponse
from logging import getLogger
import time


class SeleniumMiddleware():
    # def __init__(self, timeout=None, service_args=[]):
    def __init__(self, timeout=None):
        self.logger = getLogger(__name__)
        self.timeout = timeout
        # self.browser = webdriver.PhantomJS(service_args=service_args)
        self.browser = webdriver.Chrome()
        self.browser.set_window_size(1400, 700)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def __del__(self):
        self.browser.close()

    def process_request(self, request, spider):
        """
        Render the page in the browser (Chrome here; PhantomJS in the commented-out variant).
        :param request: Request object
        :param spider: Spider object
        :return: HtmlResponse
        """
        self.logger.debug('PhantomJS is Starting')
        page = request.meta.get('page', 1)
        try:
            self.browser.get(request.url)
            if page > 1:
                # Jump to the requested page number via the page-number input box.
                input = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
                input.clear()
                input.send_keys(page)
                submit.click()
            # Wait until the active page number and the item list are rendered.
            self.wait.until(
                EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
            self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
            return HtmlResponse(url=request.url, body=self.browser.page_source, request=request, encoding='utf-8',
                                status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))
        # return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'),
        #            service_args=crawler.settings.get('PHANTOMJS_SERVICE_ARGS'))
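
PhantomJS itself is no longer maintained, so the Chrome branch is the practical default. If the browser window is unwanted, Chrome can be started headless; a small sketch of the option setup as a drop-in replacement for the webdriver.Chrome() call in __init__ (this is an assumption, not part of the original middleware):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')     # run Chrome without opening a window
options.add_argument('--disable-gpu')  # often recommended together with headless mode
browser = webdriver.Chrome(options=options)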

Settings configuration

SELENIUM_TIMEOUT is needed in either case. To switch to PhantomJS, restore the commented-out lines and also set PHANTOMJS_SERVICE_ARGS:

SELENIUM_TIMEOUT = 20

PHANTOMJS_SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
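
The middleware itself also has to be enabled in DOWNLOADER_MIDDLEWARES, just like the earlier ones; the module path below follows the naming of the previous examples and is an assumption about your project layout:

DOWNLOADER_MIDDLEWARES = {
    'yourproject.middlewares.SeleniumMiddleware': 543,
}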
