开始用scrapy 爬取数据的时候  开始用同步操作始终会报1064  的错误  因为 mysql 语法和导入的字段不兼容

尝试了  n  次之后  开始用  异步爬取  虽然一路报错 但是还是能把数据保存到mysql 数据库里

关于spider:

# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
import re
from copy import deepcopy
from ..items import MyspiderItem class TbSpider(scrapy.Spider):
name = 'tb' allowed_domains = []
start_urls = ['http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/m?kw=%E6%A1%82%E6%9E%97%E7%94%B5%E5%AD%90%E7%A7%91%E6%8A%80%E5%A4%A7%E5%AD%A6%E5%8C%97%E6%B5%B7%E6%A0%A1%E5%8C%BA&pn=26140',
] def parse(self, response): # 总页面
item = MyspiderItem() all_elements = response.xpath(".//div[@class='i']")
# print(all_elements) for all_element in all_elements:
content = all_element.xpath("./a/text()").extract_first()
content = "".join(content.split())
change = re.compile(r'[\d]+.')
content = change.sub('', content)
item['comment'] = content person = all_element.xpath("./p/text()").extract_first()
person = "".join(person.split())
# 去掉点赞数 评论数
change2 = re.compile(r'点[\d]+回[\d]+')
person = change2.sub('',person)
# 选择日期
change3 = re.compile(r'[\d]?[\d]?-[\d][\d](?=)')
date = change3.findall(person) # 如果为今天则选择时间
change4 = re.compile(r'[\d]?[\d]?:[\d][\d](?=)')
time = change4.findall(person) person = change3.sub('',person)
person = change4.sub('',person) if time ==[]:
item['time'] = date
else:
item['time'] = time item['name'] = person # 增加密码 活跃
item['is_active'] = ''
item['password'] = '' print(item)
yield item # 下一页
next_url ='http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/' + parse.unquote( response.xpath(".//div[@class='bc p']/a/@href").extract_first()) print(next_url)
yield scrapy.Request(
next_url,
callback=self.parse, )

关于  item

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html import scrapy class MyspiderItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
comment = scrapy.Field()
time = scrapy.Field()
name = scrapy.Field()
password = scrapy.Field()
is_active = scrapy.Field()

关于setting

# -*- coding: utf-8 -*-

# Scrapy settings for mySpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'mySpider' SPIDER_MODULES = ['mySpider.spiders']
NEWSPIDER_MODULE = 'mySpider.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36' MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'mu_ke'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'root' # Obey robots.txt rules
ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default)
#COOKIES_ENABLED = False # Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False # Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#} # Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'mySpider.middlewares.MyspiderSpiderMiddleware': 543,
#} # Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'mySpider.middlewares.MyspiderDownloaderMiddleware': 543,
#} # Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#} # Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'mySpider.pipelines.MysqlPipelineTwo':200, } # Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

关于  异步的爬取   重点

import pymysql
from twisted.enterprise import adbapi class MysqlPipelineTwo(object):
def __init__(self, dbpool):
self.dbpool = dbpool @classmethod
def from_settings(cls, settings): # 函数名固定,会被scrapy调用,直接可用settings的值
"""
数据库建立连接
:param settings: 配置参数
:return: 实例化参数
"""
adbparams = dict(
host='127.0.0.1',
db='mu_ke',
user='root',
password='root',
cursorclass=pymysql.cursors.DictCursor # 指定cursor类型
)
# 连接数据池ConnectionPool,使用pymysql或者Mysqldb连接
dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
# 返回实例化参数
return cls(dbpool) def process_item(self, item, spider):
"""
使用twisted将MySQL插入变成异步执行。通过连接池执行具体的sql操作,返回一个对象
"""
query = self.dbpool.runInteraction(self.do_insert, item) # 指定操作方法和操作数据
# 添加异常处理
query.addCallback(self.handle_error) # 处理异常 def do_insert(self, cursor, item):
# 对数据库进行插入操作,并不需要commit,twisted会自动commit
insert_sql = """
insert into login_person(name,password,is_active,comment,time) VALUES(%s,%s,%s,%s,%s)
"""
cursor.execute(insert_sql, (item['name'], item['password'], item['is_active'], item['comment'],
item['time'])) def handle_error(self, failure):
if failure:
# 打印错误信息
print(failure)

最新文章

  1. <<精益创业>>读书笔记
  2. three.js 之旅一
  3. 124. Binary Tree Maximum Path Sum
  4. 宏ut_2pow_remainder
  5. H5上传文件
  6. ubuntu配置jdk脚本以及导致开不了机的解决方案
  7. vb是如何连接数据库的
  8. https证书申请流程和简介
  9. Azure存储账户的日志分析方法
  10. REST API TO MiniProgram 上线WordPress官方插件库
  11. GEM5安装
  12. delphi 线程的应用 和spcomm的应用
  13. [na]二层sw数据交换
  14. How to revert your file&folder by "FOUND.000"
  15. 为android编译libsocket的脚本
  16. Python学习---爬虫学习[scrapy框架初识]
  17. 安装DHCP 服务器 指的是由服务器控制一段IP地址范围,客户机登录服务器时就可以自动获得服务器分配的IP地址和子网掩码
  18. SpringCloud教程 | 第三篇: 服务消费者(Feign)(Finchley版本)
  19. c++ 单例模式 对全局变量的替代
  20. org.apache.hadoop.security.AccessControlException: Permissiondenied: user=liuyingping, access=WRITE,inode="/user/root/output":root:supergroup:drwxr-xr-x

热门文章

  1. leetCode练题——38. Count and Say
  2. python3实现在二叉树中找出和为某一值的所有路径
  3. svn服务器IP/URL地址更换,修改本地的仓库地址
  4. ubuntu中数据迁移的时候出现ImportError错误
  5. Attributes for Slot
  6. 「AHOI2014/JSOI2014」支线剧情
  7. 小白学 Python 爬虫:Selenium 获取某大型电商网站商品信息
  8. C/C++网络编程10——I/O复用服务器端实现select方式
  9. 文本输入框UITextField和UITextView
  10. 从零构建以太坊(Ethereum)智能合约到项目实战——学习笔记10