爬虫_电影天堂热映电影（xpath）

写了一天才写了不到100行。不过总归是按自己的思路完成了
 import requests

 from lxml import etree

 import time

 BASE = 'http://www.dytt8.net'

 def get_one_page(url):

     headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'}

     try:

         response = requests.get(url, headers=headers)

         response.encoding = response.apparent_encoding

         return response.text

     except:

         return 0

 def parse_one_page_href(html):

     str_hrefs = []

     html_element = etree.HTML(html)

     # //div[@class="co_content8"]/ul/table//a/@href

     hrefs = html_element.xpath('//table[@class="tbspan"]//a/@href')

     for href in hrefs:

         href = BASE + href

         str_hrefs.append(href)

     return str_hrefs

 """

 return

     ['http://www.dytt8.net/html/gndy/dyzz/20180731/57193.html',

      'http://www.dytt8.net/html/gndy/dyzz/20180730/57192.html',

                     ......

      'http://www.dytt8.net/html/gndy/dyzz/20180702/57064.html',

      'http://www.dytt8.net/html/gndy/dyzz/20180630/57056.html']

 """

 def get_all_pages(page_nums):

     hrefs = []

     for index in range(1, page_nums + 1):

         url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_' + str(index) + '.html'

         html = get_one_page(url)

         while html == 0:

             time.sleep(3)

             html = get_one_page(url)

         hrefs.extend(parse_one_page_href(html))

     return hrefs

 def get_detail(page_nums):

     movie = []

     hrefs = get_all_pages(page_nums)

     for href in hrefs:  #href: every page url

         informations = {}

         response = requests.get(href)

         response.encoding = response.apparent_encoding

         html = response.text

         html_element = etree.HTML(html)

         title = html_element.xpath('//font[@color="#07519a"]/text()')[0]

         informations['title'] = title

         image_src = html_element.xpath('//p//img/@src')

         informations['image_src'] = image_src[0]

         download_url = html_element.xpath('//td[@bgcolor="#fdfddf"]/a/@href')

         informations['download_url'] = download_url

         texts = html_element.xpath('//div[@id="Zoom"]//p/text()')

         for index, text in enumerate(texts):

             if text.startswith('◎片　　名'):

                 text = text.replace('◎片　　名', '').strip()

                 informations['english_name'] = text

             elif text.startswith('◎产　　地'):

                 text = text.replace('◎产　　地', '').strip()

                 informations['location'] = text

             elif text.startswith('◎上映日期'):

                 text = text.replace('◎上映日期', '').strip()

                 informations['date'] = text

             elif text.startswith('◎片　　长'):

                 text = text.replace('◎片　　长', '').strip()

                 informations['time'] = text

             elif text.startswith('◎导　　演'):

                 text = text.replace('◎导　　演', '').strip()

                 informations['director'] = text

             elif text.startswith('◎主　　演'):

                 text = text.replace('◎主　　演', '').strip()

                 actors = []

                 actors.append(text)

                 for x in range(index+1, len(texts)):

                     actor = texts[x].strip()

                     if texts[x].startswith('◎简　　介'):

                         break

                     actors.append(actor)

                 informations['actors'] = actors

             elif text.startswith('◎简　　介 '):

                 text = text.replace('◎简　　介 ', '').strip()

                 intros = []

                 # intros.append(text)

                 for x in range(index+1, len(texts)):

                     intro = texts[x].strip()

                     if texts[x].startswith('◎获奖情况'):

                         break

                     intros.append(intro)

                 informations['intros'] = intros

         movie.append(informations)

     return movie

 def main():

     page_nums = 1  #

     movie = get_detail(page_nums)

     print(movie)

 if __name__ == '__main__':

     main()
运行结果：（选中的是一部电影，一页中有25部电影，网站里一共有176页）
感受到了代码的魅力了吗
巴特西

爬虫_电影天堂热映电影（xpath）

最新文章

热门文章

巴特西

爬虫_电影天堂 热映电影（xpath）

最新文章

热门文章

爬虫_电影天堂热映电影（xpath）