python 抓取拉勾网攻略

废话不多说，直接上代码，将数据存入 MongoDB：
import requests

import pymongo

import time

import random

# Client for the MongoDB server on the local machine.
mycon = pymongo.MongoClient(host='127.0.0.1', port=27017)

# Database that receives the scraped postings.
mydb = mycon['lagou_data']

class LaGouSpider():
    """Scrape lagou.com job postings for one city/keyword and store them in MongoDB.

    Each page is fetched in two steps: a GET of the HTML search page so the
    session receives cookies, then a POST to the JSON endpoint using those
    cookies. Every posting is reduced to the fields in ``_FIELDS`` and inserted
    into the module-level ``mydb`` database, in a collection named after the
    search keyword.
    """

    # HTML search page requested first so the session obtains cookies.
    # NOTE(review): this URL (and the Referer header below) is hard-coded to a
    # python keyword regardless of ``kd`` - confirm whether that is intended.
    _SEARCH_PAGE_URL = "https://www.lagou.com/jobs/list_python%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88?labelWords=&fromSearch=true&suginput="

    # JSON endpoint that returns the postings for one result page.
    _AJAX_URL = "https://www.lagou.com/jobs/positionAjax.json"

    # Seconds to wait for each HTTP request before giving up.
    _TIMEOUT = 10

    # Fields copied from every posting into the stored document, in order.
    _FIELDS = (
        'positionName',       # job title
        'companyFullName',    # full company name
        'workYear',           # required work experience
        'education',          # required education
        'jobNature',          # nature of the job
        'salary',             # salary
        'city',               # city
        'financeStage',       # financing stage
        'industryField',      # industry / business scope
        'companyShortName',   # short company name
        'positionAdvantage',  # position advantages
        'companySize',        # company size
        'companyLabelList',   # benefit labels
        'district',           # district
        'positionLables',     # skill labels (key spelled as in the response)
        'firstType',          # job category
        'createTime',         # publication time
    )

    def __init__(self, city, kd):
        """Store the search parameters and the request headers.

        Args:
            city: City to search in; sent as the ``city`` query parameter.
            kd: Search keyword; also used as the MongoDB collection name.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
            'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        }
        self.city = city
        # Not read by get_start(); kept so existing attribute access still works.
        self.max_pn = 1
        self.kd = kd

    def get_start(self, max_page=9):
        """Fetch result pages 1..max_page and insert every posting into MongoDB.

        Args:
            max_page: Last page to request (inclusive). Defaults to 9, the
                page count that used to be hard-coded.
        """
        mycol = mydb[self.kd]  # collection named after the keyword
        # Let requests encode the query string instead of concatenating it.
        query = {'city': self.city, 'needAddtionalResult': 'false'}

        for page in range(1, max_page + 1):
            data = {
                'first': 'true',
                'pn': page,
                'kd': self.kd,
            }

            # A fresh session per page, closed as soon as the page is fetched.
            with requests.Session() as s:
                # Visit the search page first so the session receives cookies;
                # the session sends them automatically with the POST below.
                s.get(url=self._SEARCH_PAGE_URL, headers=self.headers,
                      timeout=self._TIMEOUT)
                response = s.post(url=self._AJAX_URL, params=query, data=data,
                                  headers=self.headers,
                                  timeout=self._TIMEOUT).json()

            content = response.get('content')
            if content:
                result = content['positionResult']['result']
                print('岗位名称:{},所在城市:{},开始抓取第:{}页\n'.format(self.kd,self.city,page))
                for i in result:
                    # .get() stores None for a field the response omits
                    # instead of aborting the whole crawl with KeyError.
                    lagou_data = {field: i.get(field) for field in self._FIELDS}
                    print(lagou_data)
                    # insert_one() replaces Collection.insert(), which was
                    # deprecated in PyMongo 3 and removed in PyMongo 4.
                    mycol.insert_one(lagou_data)

            time.sleep(random.uniform(3, 7))  # random pause between pages

if __name__ == '__main__':
    # Script entry point: crawl the 'python' keyword for Beijing.
    lagou = LaGouSpider(city='北京', kd='python')
    lagou.get_start()
简述：拉勾网的反爬措施一般，做法是先请求搜索页面获取 cookies 信息，然后在请求返回 JSON 数据的接口时带上这些 cookies。
巴特西

python 抓取拉勾网攻略

最新文章

热门文章

巴特西

python 抓取拉勾网 攻略

最新文章

热门文章

python 抓取拉勾网攻略