The fourth crawler I've written while learning
from lxml import etree
import requests

BASE_D = 'http://www.dytt8.net'
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0"
}


def get_detail_urls(url):
    # Fetch one listing page and return the absolute URL of every movie detail page on it.
    response = requests.get(url, headers=headers)
    text = response.text
    html = etree.HTML(text)
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # The hrefs are relative, so prefix them with the site root.
    detail_urls = map(lambda url: BASE_D + url, detail_urls)
    return detail_urls
def parse_detail_page(url):
    # Fetch one detail page and pull the movie's metadata out of it.
    movie = {}
    response = requests.get(url, headers=headers)
    # Detail pages are GBK-encoded, so decode the raw bytes explicitly.
    text = response.content.decode('gbk')
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie['title'] = title

    zoomE = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoomE.xpath(".//img/@src")
    cover = imgs[0]
    # screenshot = imgs[1]
    movie['cover'] = cover
    # movie['screenshot'] = screenshot

    def parse_info(info, rule):
        # Strip the "◎xxx" label and surrounding whitespace from an info line.
        return info.replace(rule, "").strip()
    infos = zoomE.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎年  代"):
            movie['year'] = parse_info(info, "◎年  代")
        elif info.startswith("◎产  地"):
            movie['country'] = parse_info(info, "◎产  地")
        elif info.startswith("◎类  别"):
            movie['category'] = parse_info(info, "◎类  别")
        elif info.startswith("◎豆瓣评分"):
            # Store the rating under its own key (the original wrote it into
            # movie['category'], overwriting the category field).
            movie['douban_rating'] = parse_info(info, "◎豆瓣评分")
        elif info.startswith("◎片  长"):
            movie['duration'] = parse_info(info, "◎片  长")
        elif info.startswith("◎导  演"):
            movie['director'] = parse_info(info, "◎导  演")
        elif info.startswith("◎主  演"):
            # The first actor is on this line; the rest follow one per line
            # until the next "◎" label starts a new field.
            actors = [parse_info(info, "◎主  演")]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            movie['actors'] = actors
elif info.startswith("◎简  介 "):
info = parse_info(info, "◎简  介 ")
for x in range(index+1, len(infos)):
profile = infos[x].strip()
movie["profile"] = profile
download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")[0]
movie['download_url'] = download_url
return movie
def spider():
    base_url = "http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
    movies = []
    # Crawl the first seven listing pages.
    for x in range(1, 8):
        url = base_url.format(x)
        detail_urls = get_detail_urls(url)
        for detail_url in detail_urls:
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    # Return the collected results so they can be reused (the original only printed them).
    return movies


if __name__ == '__main__':
    spider()
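
As a possible extension beyond the original script, the scraped dictionaries could be persisted instead of only printed. Below is a minimal sketch that assumes spider() returns its movies list as above; the save_movies helper and the movies.json filename are made up for illustration.

import json

def save_movies(movies, path='movies.json'):
    # Write the movie dicts as UTF-8 JSON so the Chinese fields stay readable.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(movies, f, ensure_ascii=False, indent=2)

# Usage sketch: save_movies(spider())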
