Python 2.7 爬取51job 全国java岗位

每页有50条数据,一共2000页,分页方式是GET分页(页码直接写在URL里)。

#!/usr/bin/python

# encoding: utf-8

import requests

import threading

from lxml import etree

import sys

import os

import datetime

import re

import random

import time

# Python 2-only idiom: reload sys so that setdefaultencoding is available
# again, then make UTF-8 the default codec for implicit str/unicode
# conversions. Neither reload() nor setdefaultencoding() exists in Python 3.
reload(sys)

sys.setdefaultencoding('utf-8')

def log(context):
    """Append one line of text to the crawler log file ``./log/log.txt``.

    The ``./log`` directory is created on first use so that a fresh working
    directory does not make the very first log call fail.

    :param context: message to record; a trailing newline is appended.
    :raises IOError: if the log file cannot be opened or written.
    :raises OSError: if the log directory cannot be created.
    """
    txtName = "./log/log.txt"
    logDir = os.path.dirname(txtName)
    if not os.path.isdir(logDir):
        try:
            os.makedirs(logDir)
        except OSError:
            # Another process may have created the directory in the meantime;
            # only propagate the error when it is really still missing.
            if not os.path.isdir(logDir):
                raise
    # ``with`` guarantees the handle is closed even when the write fails.
    with open(txtName, "a") as f:
        f.write(context + "\n")

def _ensure_parent_dir(path):
    """Create the directory that will contain *path* if it is missing."""
    parent = os.path.dirname(path)
    if parent and not os.path.isdir(parent):
        try:
            os.makedirs(parent)
        except OSError:
            # Tolerate a concurrent creation; re-raise real failures.
            if not os.path.isdir(parent):
                raise


def _first(nodes):
    """Return the first XPath result, or "" when the query matched nothing."""
    return nodes[0] if nodes else ""


def _format_row(row):
    """Build the output record (without newline) for one result row element."""
    name = _first(row.xpath('p/span/a/text()'))
    link = _first(row.xpath('p/span/a/@href'))
    company = _first(row.xpath('span[@class="t2"]/a/text()'))
    city = _first(row.xpath('span[@class="t3"]/text()'))
    # The salary column is empty for some postings.
    salary = _first(row.xpath('span[@class="t4"]/text()'))

    # Drop every space, then trim the remaining surrounding whitespace
    # (newlines, tabs) so that each record stays on a single line.
    fields = [
        name.replace(' ', '').strip(),
        company.replace(' ', '').strip(),
        city.replace(' ', '').strip(),
        salary.strip(),
        link.strip(),
    ]
    return "=============".join(fields)


def xin(start_page=1, end_page=2000):
    """Crawl 51job "java" search result pages and append the jobs to a file.

    Every page in ``start_page..end_page`` (both inclusive) is fetched; each
    job row becomes one line in ``./file/java.txt`` with the fields
    name, company, city, salary and link joined by ``=============``.
    Progress and failures are recorded through :func:`log`. Between pages
    the crawler sleeps for a random 1-5 seconds.

    A page whose request fails (network error, timeout, HTTP error status) is
    logged and skipped instead of aborting the whole run.

    :param start_page: first result page to fetch (1-based).
    :param end_page: last result page to fetch, inclusive.
    """
    # Request headers
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9'
    }

    txtName = "./file/java.txt"
    _ensure_parent_dir(txtName)

    for count in range(start_page, end_page + 1):
        url = "https://search.51job.com/list/000000,000000,0000,00,9,99,java,2,"+str(count)+".html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="

        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url, headers=header, timeout=30)
            response.raise_for_status()
        except requests.RequestException as err:
            log("第"+str(count)+"页请求失败--"+str(err))
        else:
            # The page is decoded as GBK; undecodable bytes are replaced
            # rather than aborting the whole page.
            html = response.content.decode("gbk", "replace")
            selector = etree.HTML(html) if html.strip() else None
            if selector is None:
                contents = []
            else:
                contents = selector.xpath('//div[@class="dw_table"]/div[@class="el"]')
            log("第"+str(count)+"页了--"+str(len(contents))+"条数据")

            # Open the output once per page; ``with`` closes it on error too.
            with open(txtName, "a") as f:
                for eachlink in contents:
                    f.write(_format_row(eachlink) + "\n")

        # Random pause between pages.
        sui = random.randint(1, 5)
        log("休眠"+str(sui))
        time.sleep(sui)

# Start the crawl only when this file is executed as a script.
if __name__=="__main__":

    xin()

日志文件

爬取的数据

但是爬取的速度有点慢,

于是乎采用了多线程爬取,

但是51job 立刻就把IP段给封掉了,

于是用了4台服务器,每台爬取500页数据,最后再合并到一起导入数据库中。

人生苦短,我用Python!!!

巴特西

Python 2.7 爬取51job 全国java岗位

最新文章

热门文章