昨晚没事写的爬取腾讯新闻代码,在此贴出,可以参考完善。

# -*- coding: utf-8 -*-
import json

from bs4 import BeautifulSoup
from scrapy import Spider
from scrapy.http import FormRequest
from scrapy.http import Request
from scrapy.http import Response
from scrapy.selector import Selector

from ..items import NewsItem

# Feed endpoint requested by the spider's initial request.
TencentNewsUrl = 'https://pacaio.match.qq.com/irs/rcd'

# Sample URLs kept for reference.
# Top news: https://pacaio.match.qq.com/pc/topNews?callback=__jp0
# https://pacaio.match.qq.com/irs/rcd?cid=108&ext=&token=349ee24cdf9327a050ddad8c166bd3e3&page=1&expIds=&callback=__jp1
# https://new.qq.com/cmsn/20180726/20180726A0QOLA00
# https://new.qq.com/omn/20180726/20180726A0QOLA.html


class TencentSpider(Spider):
    """Spider that requests a Tencent News feed and scrapes a linked article.

    Flow: ``start_requests`` asks the feed endpoint for a listing,
    ``parse_contents`` decodes the listing and schedules an article request,
    and ``parse_news`` turns the article page into a ``NewsItem``.
    """

    name = 'tencent'

    def start_requests(self):
        """Yield the initial feed request, handled by ``parse_contents``."""
        # Alternative entry point (top-news feed), kept for reference:
        # yield Request(
        #     url='https://pacaio.match.qq.com/pc/topNews?callback=__jp0',
        #     callback=self.parse_contents
        # )
        # NOTE(review): the sample URLs in this file pass these fields as a
        # GET query string, while FormRequest submits ``formdata`` as a form
        # body unless a method is given -- confirm the endpoint accepts this.
        yield FormRequest(
            url=TencentNewsUrl,
            formdata={
                "cid": "58",
                "token": "c232b098ee7611faeffc46409e836360",
                "ext": "milite",
                "page": "0",
                "expIds": "",
                "callback": "__jp0",
            },
            callback=self.parse_contents,
            meta={
                "page": "0",
                "field": "",
            },
        )

    @staticmethod
    def _decode_payload(text):
        """Decode a feed body that is either plain JSON or JSONP.

        Raises ``ValueError`` (``json.JSONDecodeError``) when neither form
        parses, matching the original's behaviour on malformed bodies.
        """
        try:
            return json.loads(text)
        except ValueError:
            # JSONP: keep only what sits between the first "(" and last ")".
            start = text.find('(') + 1
            end = text.rfind(')')
            return json.loads(text[start:end])

    @staticmethod
    def _article_url(vurl):
        """Rewrite a ``/cmsn/...00`` link to ``/omn/....html``; else return it unchanged."""
        if vurl.endswith('00') and '/cmsn/' in vurl:
            # Drop the trailing "00" and switch to the article page path.
            return vurl.replace('/cmsn/', '/omn/')[:-2] + '.html'
        return vurl

    def parse_contents(self, response: Response):
        """Decode the feed listing and schedule an article request.

        The decoded payload may be a dict wrapping the entries under
        ``'data'`` or the list of entries itself; both are accepted.
        Entries without a ``'vurl'`` value are skipped.

        Yields:
            A ``Request`` for the article page, handled by ``parse_news``.
        """
        payload = self._decode_payload(response.text)
        if isinstance(payload, dict):
            entries = payload.get('data') or []
        else:
            entries = payload or []

        for entry in entries:
            vurl = entry.get('vurl') if isinstance(entry, dict) else None
            if not vurl:
                continue
            article_url = self._article_url(vurl)
            self.logger.debug("Scheduling article %s", article_url)
            yield Request(
                url=article_url,
                callback=self.parse_news,
            )
            # NOTE(review): the original code breaks out of the loop after
            # the first request, so only one article per listing is crawled.
            # Kept as-is; remove the break to crawl every entry.
            break

    def parse_news(self, response: Response):
        """Build a ``NewsItem`` (url, title, content) from an article page.

        The title is the ``<h1>`` inside ``<div class="LEFT">``; the content
        is the concatenated text of every ``<p class="one-p">``.

        Returns:
            The populated item, or ``None`` when the page has no title node.
        """
        soup = BeautifulSoup(response.text, "lxml")
        container = soup.find('div', class_='LEFT')
        heading = container.h1 if container is not None else None
        if heading is None:
            # Unexpected layout: emit no item rather than raising.
            self.logger.warning(
                "No article title found at %s; skipping", response.url
            )
            return None

        news = NewsItem()
        news['url'] = response.url
        news['title'] = heading.text
        news['content'] = ''.join(
            paragraph.text for paragraph in soup.find_all('p', class_='one-p')
        )
        return news

  

最新文章

  1. MySQL Information Functions
  2. 把字符转换为 HTML 实体
  3. java高薪之路__010_设计模式
  4. [转]How to insert a row between two rows in an existing excel with HSSF (Apache POI)
  5. HDU 4435 charge-station bfs图论问题
  6. UITextField限制字数的方法
  7. apicloud+融云实现即时通讯
  8. oc总结
  9. Oracle 学习笔记(一)Oracle的基本介绍与语法
  10. Lucene——Field.Store(存储域选项)及Field.Index(索引选项)
  11. jQuery获取或设置元素的宽度和高度
  12. Python运算符——复合运算符
  13. [mybatis]Example的用法-转
  14. [C语言]易错知识点、小知识点复习(1)
  15. Hdoj 1856.More is better 题解
  16. tableView 分割线的处理
  17. Gravitee.io docker-compose运行
  18. windows下如何查看端口,关闭端口,开启端口
  19. AngularJS路由系列(6)-- UI-Router的嵌套State
  20. Hadoop学习之路(二十四)YARN的资源调度

热门文章

  1. 【VS开发】windows下的signal
  2. How George Washington Angered Lawmakers Over Thanksgiving——VOA慢速英语
  3. cmake vs qmake
  4. 《ucore lab8》实验报告
  5. 关于time_wait状态的理解
  6. 哈希--Hash,“散列”/“哈希”
  7. [.Net Core] - 当 .Net Core 版本由 1.x 升级至 2.x 后,Cookie 使用方式变更
  8. 下载安装Git,学习笔记
  9. 计算机网络自顶向下方法第3章-传输层 (Transport Layer).2
  10. Manthan, Codefest 19 (open for everyone, rated, Div. 1 + Div. 2) (1208F,1208G,1208H)