Python3.6多线程爬虫

Python版本 3.6

简单写一个爬虫,在写的过程中熟悉Python语法,不得不说Python用起来真666。

　　代码功能是访问网站首页,将所有a标签的值作为文件夹名,并将当前网页的所有图片下载到对应文件夹中;其实还有很多很多需要修改和完善的地方,比如异常、多线程、递归等;以后有机会再说吧。欢迎拍砖。

 1 # -*- coding: utf-8 -*-

 2 from urllib import request

 3 from bs4 import BeautifulSoup

 4 import os

 5 import time, threading

 6

 7

# Depth counter value handed to the first CallView call by the script at the bottom of the file.
exe_Count = 1
# Link targets already recorded by CallView; consulted there so the same href is not queued twice.
aList = []

10

def CallView(url, timeout, directoryPath, exe_count):
    """Fetch one page, hand its links to a worker thread, and download its images.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout passed to ``urlopen`` (seconds) for the page and for
            every image request.
        directoryPath: Directory that receives the images of this page; it is
            created (including missing parents) when absent.
        exe_count: Depth counter, forwarded unchanged to ``ForRequest``.

    Side effects:
        Appends every newly seen link to the module-level ``aList``, starts one
        ``threading.Thread`` running ``ForRequest`` for the links found on this
        page, and writes image files into ``directoryPath``.

    Errors are reported with ``print`` and are not propagated to the caller.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    from urllib.parse import urljoin, urlsplit

    try:
        listAvalue = []
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2716.5 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
        }
        rep = request.Request(url, headers=headers)
        # Close the connection as soon as the document has been parsed.
        with request.urlopen(rep, timeout=timeout) as response:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(response, "html.parser")

        # Collect the href of every <a> tag that has plain text content.
        for a in soup.find_all("a"):
            if a.string is None:
                continue
            # Anchors without an href (or with an empty one) have no target.
            href = (a.get("href") or "").strip()
            if not href:
                continue
            # Resolve relative links against the page they were found on.
            link = urljoin(url, href)
            if link in aList:
                continue
            aList.append(link)
            listAvalue.append([a.string.strip()[0:11], link])

        # Create the target directory when it does not exist yet; exist_ok
        # avoids a failure when another crawler thread creates it first.
        os.makedirs(directoryPath, exist_ok=True)
        print("新目录：" + directoryPath)

        # Follow the collected links in a separate thread.
        thread = threading.Thread(target=ForRequest, args=(listAvalue, timeout, directoryPath, exe_count))
        thread.start()

        # Download every distinct image referenced by the page.
        listImgSrc = set()
        for img in soup.find_all("img"):
            try:
                src = (img.get("src") or "").strip()
                if not src:
                    continue
                imgSrc = urljoin(url, src)
                print(imgSrc)
                # Skip sources that were already handled on this page.
                if imgSrc in listImgSrc:
                    continue
                listImgSrc.add(imgSrc)
                # Use only the path part for the file name so a query string
                # or fragment does not end up in it.
                filename = os.path.basename(urlsplit(imgSrc).path)
                if not filename:
                    continue
                # Read the image.
                with request.urlopen(request.Request(imgSrc), timeout=timeout) as imgResponse:
                    data = imgResponse.read()
                # Write the image.
                with open(os.path.join(directoryPath, filename), "wb") as o:
                    o.write(data)
            except Exception:
                # One broken image must not stop the remaining downloads.
                print("访问图片或者写入本地Error")
    except request.HTTPError as e:
        print(e.code)
    except Exception:
        print("CallView Error")

59

60

def ForRequest(listA, timeout, directoryPath, exe_count):
    """Crawl every collected link into its own sub-directory, one level deeper.

    Args:
        listA: Iterable of ``[name, url]`` pairs; ``name`` becomes the
            sub-directory name and ``url`` is passed to ``CallView``.
        timeout: Timeout forwarded to ``CallView``.
        directoryPath: Parent directory under which the sub-directories are
            created.
        exe_count: Current depth counter. Nothing is crawled once it has
            reached 2; otherwise it is incremented and forwarded.

    Returns:
        None.
    """
    print("当前已执行：" + str(exe_count) + " 次")
    # Stop once the depth limit is reached. Returning ends the worker thread
    # cleanly, and ">=" also stops a counter that starts above the limit.
    if exe_count >= 2:
        return
    exe_count = exe_count + 1

    for info in listA:
        directoryChildPath = directoryPath + "/" + info[0]
        try:
            # exist_ok avoids the race between an existence check and the
            # creation when several crawler threads share a directory.
            os.makedirs(directoryChildPath, exist_ok=True)
        except OSError:
            # A name the file system rejects must not abort the other links.
            print("创建目录失败：" + directoryChildPath)
            continue
        CallView(info[1], timeout, directoryChildPath, exe_count)

75

# Script entry: crawl the start page and store what is found below the given directory.
try:
    print("爬虫开始活动了")
    # NOTE(review): urlopen timeouts are given in seconds, so 5000 is roughly
    # 83 minutes per request; confirm whether a smaller value was intended.
    CallView("http://www.xxxxx.com", 5000, "D:/PythonTest/Img/素材公社", exe_Count)
    print("爬虫正在偷偷活动,不要着急哦！")
except Exception:
    # Catch Exception rather than everything so Ctrl-C and SystemExit still
    # terminate the script.
    print("Error")

巴特西

Python3.6多线程爬虫

最新文章

热门文章