import re
from datetime import timedelta

from tornado import httpclient, gen, ioloop, queues

# Photos to crawl per academy: {academy_code: number_of_photo_ids}.
# NOTE(review): every key here is the empty string — the real academy codes
# were evidently stripped when this file was extracted. A dict literal with
# duplicate keys keeps only the LAST one, so at runtime this collapses to
# {'': 56}. Restore the original keys before relying on this mapping.
peoples = {
    '': 71, '': 66, '': 54, '': 50, '': 66, '': 61,
    '': 103, '': 31, '': 32, '': 41, '': 33, '': 93, '': 50, '': 108, '': 55,
    '': 55, '': 92, '': 56, '': 29, '': 27,
    '': 25, '': 25, '': 50, '': 66, '': 68, '': 52, '': 50, '': 50, '': 52, '': 50,
    '': 133, '': 166, '': 10, '': 8, '': 99, '': 18,
    '': 50, '': 24, '': 19, '': 25, '': 24, '': 24, '': 67, '': 52, '': 67,
    '': 67, '': 8, '': 31, '': 82, '': 62, '': 8, '': 104, '': 52, '': 52, '': 47,
    '': 56, '': 72, '': 57, '': 36, '': 50, '': 120, '': 50,
    '': 56,
}
class AsySpider(object):
    """A simple asynchronous crawler built on Tornado coroutines.

    Crawls *urls* with at most *concurrency* requests in flight at once.
    Subclasses customize behaviour by overriding :meth:`fetch`,
    :meth:`handle_html`, or :meth:`handle_response`.
    """

    def __init__(self, urls, concurrency=10, results=None, **kwargs):
        """Prepare the work queue.

        :param urls: list of URLs to crawl; mutated (reversed/popped) in place.
        :param concurrency: number of concurrent worker coroutines.
        :param results: optional pre-existing list to collect results into.
        """
        urls.reverse()  # pop() takes from the end, so reverse to keep order
        self.urls = urls
        self.concurrency = concurrency
        self._q = queues.Queue()
        self._fetching = set()  # URLs whose fetch has started
        self._fetched = set()   # URLs whose fetch has completed
        # BUG FIX: the original assigned self.results only when results was
        # None, leaving the attribute undefined when a list was passed in.
        self.results = [] if results is None else results

    def fetch(self, url, **kwargs):
        """Issue the HTTP request; returns a Future (raise_error disabled)."""
        fetch = getattr(httpclient.AsyncHTTPClient(), 'fetch')
        return fetch(url, raise_error=False, **kwargs)

    def handle_html(self, url, html):
        """Handle an HTML page; inherit and rewrite this method."""
        print(url)

    def handle_response(self, url, response):
        """Inherit and rewrite this method if necessary.

        200 -> delegate to handle_html; 599 (connection/timeout error) ->
        put the URL back on the queue for a retry. Other codes are dropped.
        """
        if response.code == 200:
            self.handle_html(url, response.body)
        elif response.code == 599:  # retry
            self._fetching.remove(url)
            self._q.put(url)

    @gen.coroutine
    def get_page(self, url):
        """Fetch one page; on failure the exception object is returned."""
        try:
            response = yield self.fetch(url)
            # print('######fetched %s' % url)
        except Exception as e:
            print('Exception: %s %s' % (e, url))
            raise gen.Return(e)
        raise gen.Return(response)

    @gen.coroutine
    def _run(self):
        """Drive the crawl: seed the queue, start workers, wait for drain."""

        @gen.coroutine
        def fetch_url():
            current_url = yield self._q.get()
            try:
                if current_url in self._fetching:
                    return
                # print('fetching****** %s' % current_url)
                self._fetching.add(current_url)
                response = yield self.get_page(current_url)
                self.handle_response(current_url, response)  # handle response
                self._fetched.add(current_url)
                # Refill the queue so up to `concurrency` URLs stay pending.
                for i in range(self.concurrency):
                    if self.urls:
                        yield self._q.put(self.urls.pop())
            finally:
                self._q.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        # Add first url; guard against an empty list (the original raised
        # IndexError when constructed with no URLs).
        if self.urls:
            self._q.put(self.urls.pop())

        # Start workers, then wait for the work queue to be empty.
        for _ in range(self.concurrency):
            worker()
        yield self._q.join(timeout=timedelta(seconds=300000))
        try:
            assert self._fetching == self._fetched
        except AssertionError:
            print(self._fetching - self._fetched)
            print(self._fetched - self._fetching)

    def run(self):
        """Block until the crawl finishes."""
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)
class MySpider(AsySpider):
    """Spider that downloads user photos from the campus portal."""

    def fetch(self, url, **kwargs):
        """Override the parent fetch to send the session cookie and a UA."""
        # NOTE(review): this session cookie is hard-coded and will expire;
        # it must be refreshed from a live login before each run.
        cookies_str = 'JSESSIONID=0000n4jBi_dKg91XbtHHQHDeeDL:1b4e17j2v; iPlanetDire' \
                      'ctoryPro=AQIC5wM2LY4Sfcxu%' \
                      '2FWPIJWGHttZPiXafd%2B1gowyEoxTmyiY%3D%40AAJTSQACMDE%3D%23'
        headers = {
            'User-Agent': 'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
            'cookie': cookies_str
        }
        return super(MySpider, self).fetch(
            url, headers=headers
        )

    def handle_html(self, url, html):
        """Save the photo bytes to disk, named after the ownerId in the URL.

        :param url: the fetched URL, containing `userPhoto&ownerId=<id>`.
        :param html: raw response body (JPEG bytes).
        """
        # Append a sentinel so `(.*)` captures everything after ownerId=.
        url += 'qwertyu'
        pattern = re.compile('userPhoto&ownerId=(.*)qwertyu')
        filename = re.findall(pattern, url)[0]
        # Change pic_dir to wherever you want the photos stored,
        # e.g. C:/picture/  (renamed from `dir`, which shadowed a builtin).
        pic_dir = '/home/innovation/文档/pic/'
        # `with` closes the file automatically; the original also called
        # file.close() redundantly and shadowed the `file` builtin.
        with open(pic_dir + filename + '.jpg', 'wb') as fp:
            fp.write(html)
def main():
    """Build the photo-download URL list from `peoples` and run the spider."""
    urls = []
    url_pic = 'http://myportal.sxu.edu.cn/attachmentDownload.portal?notUseCache=true&type=userPhoto&ownerId='
    for academy in peoples:
        # Student numbers are 1-based (the original looped range(n) and
        # then did `i += 1`).
        for i in range(1, peoples[academy] + 1):
            # NOTE(review): the '' prefixes below were almost certainly
            # zero-padding strings ('00' / '0') lost when this file was
            # extracted — as written they are no-ops; restore the padding
            # so ids line up with the portal's format.
            if i < 10:
                i = '' + str(i)
            elif 100 > i >= 10:
                i = '' + str(i)
            urls.append(url_pic + '' + academy + str(i))
    s = MySpider(urls)
    s.run()


if __name__ == '__main__':
    main()

最新文章

  1. JavaWeb学习笔记——SAX解析
  2. LTE Module User Documentation(翻译12)——X2切换(X2-based handover)
  3. Hadoop和Spark的异同
  4. 【转】如何调试bash脚本
  5. erlang observer工具
  6. 移动端和web端前端UI库—Frozen UI、WeUI、SUI Mobile
  7. .NET 的webservice例子
  8. 【HDOJ】4322 Candy
  9. uCGUI窗口的创建过程分析
  10. (转)在SAE使用Apple Push Notification Service服务开发iOS应用, 实现消息推送
  11. ISO C Random Number Functions
  12. cocos2dx lua
  13. 判断android文件是否加壳
  14. WP8.1开发对图片进行裁剪(截取)一部分
  15. 转-Tomcat 8 安装和配置、优化
  16. python编写图形界面
  17. 面试之路(13)-android apk之间共享数据的方式以及shareUserId详解
  18. [20190416]exclusive latch测试脚本.txt
  19. 【融云分析】如何实现分布式场景下唯一 ID 生成?
  20. Markdown编辑器开发记录(一):开发的初衷和初期踩的坑

热门文章

  1. logrotate
  2. 如何清除WebBrowser的Cookies
  3. 学习Linux系列--Python资源收集
  4. 图片使用base64展示代码,后台为jfinal
  5. delphi强制WebBrowser控件使用指定版本显示网页
  6. Is the Information Reliable? -POJ2983差分约束
  7. Android 上下文对象
  8. linux块设备驱动之实例
  9. 【前端】JSON.stringify 和 JSON.parse(待续)
  10. MATLAB地图工具箱学习总结(一)从地图投影说起