Spider: mobile-to-mobile crawler (Sogou mobile search -> Baidu Map result pages)

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys

# Delimiters JMTool appends to a place name; everything after the first
# delimiter found is stripped to recover the base name.
tag_jmtool_list = ['(', '(', '-']

# Pool of mobile user-agent strings, one per line in mobile_ua.txt.
ua_list = []
with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
    for i in uafile:
        if i.find('Mozilla') > -1:
            ua_list.append(i.replace('\n', '').strip())
ua_list_len_ = len(ua_list) - 1  # highest valid index for random.randint


def extract_name(name_):
    """Return the base place name with any JMTool suffix stripped.

    Splits on each delimiter in tag_jmtool_list and keeps the left part.
    """
    for i in tag_jmtool_list:
        name_ = name_.split(i)[0]
    return name_


target_type_list = ['住宅小区', '写字楼']
target_type_list = ['住宅小区']  # deliberately narrowed to residential estates only

# target_dic[city][district][type_][base_name] ->
#     {'name_reduction_list': [raw names], 'history_list': [raw CSV rows]}
target_dic = {}
with open('JMTool0819am/任务JMTool.csv', 'r', encoding='utf-8') as csvfile:
    for i in csvfile:
        l = i.replace(' ', '').replace('\n', '').split('";"')
        if l[0].replace('"', '') in target_type_list:
            type_, city, district, addr, name_ = l
            type_, name_ = type_.replace('"', ''), name_.replace('"', '')
            name_reduction = extract_name(name_)
            if city not in target_dic:
                target_dic[city] = {}
            if district not in target_dic[city]:
                target_dic[city][district] = {}
            if type_ not in target_dic[city][district]:
                target_dic[city][district][type_] = {}
            # BUG FIX: the original tested membership on the district level
            # (`not in target_dic[city][district]`), which was always true,
            # so both lists were re-created for every row and history lost.
            if name_reduction not in target_dic[city][district][type_]:
                target_dic[city][district][type_][name_reduction] = {}
                target_dic[city][district][type_][name_reduction]['name_reduction_list'] = []
                target_dic[city][district][type_][name_reduction]['history_list'] = []
            target_dic[city][district][type_][name_reduction]['name_reduction_list'].append(name_)
            target_dic[city][district][type_][name_reduction]['history_list'].append(l)


def write_res_html(browser, dir_='baidu_map_html/'):
    """Save the browser's current page to <dir_><input_><timestamp>.html.

    The source URL is embedded as a leading HTML comment.  Reads the
    module-global input_ (set by the driver loop below).
    """
    current_url_ = '%s%s%s%s' % ('<!--', input_, browser.current_url, '-->')
    page_source = '%s%s' % (current_url_, browser.page_source)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s%s' % (dir_, input_, localtime_, '.html')
    # BUG FIX: the original read the fo.closed attribute instead of calling
    # fo.close(), leaking the file handle; a context manager closes reliably.
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)


def gen_random_letter():
    """Return one random lowercase ASCII letter (a-z)."""
    return chr(random.randint(97, 122))


def gen_random_num():
    """Return one random decimal digit (0-9)."""
    # BUG FIX: randint is inclusive on both ends, so randint(0, 10) could
    # yield 10 and make the generated pid longer than 16 characters.
    return random.randint(0, 9)


def gen_sougo_pid():
    """Build a 16-character Sogou pid: letters at positions 1, 3, 4 and 15,
    digits everywhere else."""
    res_ = ''
    for i in range(1, 17, 1):
        if i in [1, 3, 4, 15]:
            res_ = '%s%s' % (res_, gen_random_letter())
        else:
            res_ = '%s%s' % (res_, gen_random_num())
    return res_


def close_alert(browser, attitude='accept'):
    """Accept or dismiss a JavaScript alert if one is open; no-op otherwise."""
    try:
        sleep(2)
        # BUG FIX: switch_to.alert is a property, not a method; the original
        # `browser.switch_to.alert()` raised TypeError, so the alert was
        # never actually accepted/dismissed.
        al = browser.switch_to.alert
        sleep(1)
        if attitude == 'accept':
            al.accept()
        elif attitude == 'dismiss':
            al.dismiss()
        print(sys._getframe().f_lineno, 'alert-closed-ok')
    except Exception as e:
        # BUG FIX: print the caught exception instance, not the Exception class.
        print(sys._getframe().f_lineno, e, 'no-alert')


# input_ = '深圳市南山区荟芳园'
def mobile_mobile_pages_html(input_):
    """Search Sogou mobile for input_, open the Baidu Map results and save
    every result page to disk via write_res_html.

    NOTE(review): write_res_html reads the module-global input_, which the
    driver loop sets to the same value as this parameter — confirm before
    calling this function in any other way.
    """
    ua_list_index = random.randint(0, ua_list_len_)
    mobile_emulation = {
        "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0}}
    mobile_emulation['userAgent'] = ua_list[ua_list_index]
    chrome_options = Options()
    chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    browser = webdriver.Chrome(chrome_options=chrome_options)
    url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-%s-0007&keyword=百度地图' % (gen_sougo_pid())
    print(url_seed)
    browser.get(url_seed)
    # Type the query into the search box via JS, then click the first result.
    js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
    browser.execute_script(js)
    xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
    browser.find_element_by_xpath(xp_newpage).click()
    sleep(2)
    close_alert(browser)
    try:
        xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
        sleep(2)
        close_alert(browser)
        browser.find_element_by_xpath(xp)
    except Exception as e:
        print(sys._getframe().f_lineno, e)
        browser.quit()  # BUG FIX: the original returned without quitting, leaking the browser
        return
    close_alert(browser)
    if browser.find_element_by_xpath(xp).text.find('全部') == -1:
        browser.quit()  # BUG FIX: same leak on this early return
        return
    # The element text looks like "...全部N条..."; N is the result count.
    res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
    res_num = int(res_num)
    page_num = 10
    loop_breaker = math.ceil(res_num / page_num)
    close_alert(browser)
    if res_num <= page_num:
        # Single page of results: save it and stop.
        write_res_html(browser)
        browser.quit()
        return
    close_alert(browser)
    xp = '//*[@id="place-widget-placenewlist-showall"]'
    browser.find_element_by_xpath(xp).click()
    write_res_html(browser)
    close_alert(browser)
    js = "window.scrollTo(0,document.body.scrollHeight)"
    browser.execute_script(js)
    sleep(1)
    try:
        xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
        browser.find_element_by_xpath(xp_newpage).click()
        sleep(1)
    except Exception as e:
        print(sys._getframe().f_lineno, e)
        write_res_html(browser)
        browser.quit()
        return
    # Walk the remaining result pages: dismiss banner, scroll, click "next".
    for i in range(1, loop_breaker, 1):
        sleep(1)
        try:
            xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
            sleep(3)
            browser.find_element_by_xpath(xp).click()
        except Exception as e:
            print(sys._getframe().f_lineno, e)
            sleep(10)
            break
        try:
            js = "window.scrollTo(0,document.body.scrollHeight)"
            browser.execute_script(js)
            sleep(1)
        except Exception as e:
            print(sys._getframe().f_lineno, e)
            sleep(10)
        try:
            xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
            sleep(1)
            print(input_, i)
            browser.find_element_by_xpath(xp_newpage).click()
            write_res_html(browser)
        except Exception as e:
            print(sys._getframe().f_lineno, e)
            sleep(10)
    sleep(2)
    browser.quit()


# Driver: one crawl per (city, district, base name) found in the task CSV.
for city in target_dic:
    for district in target_dic[city]:
        for type_ in target_dic[city][district]:
            for name_reduction in target_dic[city][district][type_]:
                input_ = '%s%s%s' % (city, district, name_reduction)
                mobile_mobile_pages_html(input_)

  

# Standalone/manual variant of the crawler above: one hard-coded query,
# fixed UA and pid, no retries or alert handling.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from time import sleep
import math

url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
mobile_emulation = {
    "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
chrome_options = Options()
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get(url_seed)

input_ = '深圳市南山区荟芳园'
# Type the query via JS, then click through to the Baidu Map result.
js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
browser.execute_script(js)
xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
browser.find_element_by_xpath(xp_newpage).click()
sleep(1)
xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
browser.find_element_by_xpath(xp).click()
xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
browser.find_element_by_xpath(xp)
# Element text looks like "...全部N条..."; N is the result count.
res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
res_num = int(res_num)
page_num = 10
loop_breaker = math.ceil(res_num / page_num)


def write_res_html(browser, dir_='baidu_map_html/'):
    """Save the current page to <dir_><input_><timestamp>.html, prefixed
    with its URL in an HTML comment.  Reads the module-global input_."""
    current_url_ = '%s%s%s%s' % ('<!--', input_, browser.current_url, '-->')
    page_source = '%s%s' % (current_url_, browser.page_source)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s%s' % (dir_, input_, localtime_, '.html')
    # BUG FIX: the original read fo.closed instead of calling fo.close(),
    # leaking the handle; the context manager closes it deterministically.
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)


xp = '//*[@id="place-widget-placenewlist-showall"]'
browser.find_element_by_xpath(xp).click()
write_res_html(browser)
js = "window.scrollTo(0,document.body.scrollHeight)"
browser.execute_script(js)
sleep(1)
xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
browser.find_element_by_xpath(xp_newpage).click()
sleep(1)
# Page through the remaining results, saving each page.
# NOTE(review): unlike the function-based variant, there is no try/except
# here — any missing element aborts the script; confirm that is acceptable.
for i in range(1, loop_breaker, 1):
    sleep(1)
    xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
    browser.find_element_by_xpath(xp).click()
    js = "window.scrollTo(0,document.body.scrollHeight)"
    browser.execute_script(js)
    sleep(1)
    xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
    browser.find_element_by_xpath(xp_newpage).click()
    write_res_html(browser)

  

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Minimal smoke test: open the Sogou mobile SERP for "百度地图" in an
# emulated Nexus 5 and click the first (Baidu Map) result for a fixed query.
url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
mobile_emulation = {
    "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
chrome_options = Options()
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get(url_seed)
# Fill the search box via JS (value set directly, no keystrokes).
js = 'document.getElementsByClassName("input-default js_input")[0].value="深圳市南山区海岸城"'
browser.execute_script(js)
xp = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
browser.find_element_by_xpath(xp).click()

  

User-agent samples (contents of the UA list file):

Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Opera/9.25 (Windows NT 5.1; U; en)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)
Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0
import os, sys
import time
import logging
import requests
import threading
from random import choice
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Global convention, to ease later log analysis:
# os._exit(INT) exit codes used by this script: 4001 4002 4003 4004

os_sep = os.sep
this_file_abspath, this_file_name = os.path.dirname(os.path.abspath(__file__)), os.path.abspath(__file__).split(os_sep)[
    -1]
base_dir = os.path.dirname(os_sep.join(os.path.abspath(__file__).split(os_sep)[0:-2]))
log_abspath = '%s%s%s' % (base_dir, os_sep, 'log')

# Bootstrap logging must not depend on the logging module being configured:
# write the startup record to the log file by hand.
now_, e = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), '启动脚本'
logf, s = '%s%s%s%s' % (log_abspath, os_sep, this_file_name, now_), '%s%s%s%s' % (__file__, now_, os.getcwd(), e)
with open(logf, 'a') as fo:
    fo.write(s)
print(s)

try:
    sys.path.append(base_dir)
    from core.utils import MysqlHelper
except Exception as e:
    s = '%s%s%s' % (
        'from core.utils import MysqlHelper EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())),
        e)
    with open(logf, 'a') as fo:
        fo.write(s)
    print(s)
    os._exit(4001)

try:
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s[thread:%(thread)d][process:%(process)d]',
                        datefmt='%a, %d %b %Y %H:%M:%S',
                        filename=logf,
                        filemode='a')
except Exception as e:
    s = '%s%s%s' % ('logging.basicConfig EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
    with open(logf, 'a') as fo:
        fo.write(s)
    print(s)
    os._exit(4002)

# Load the desktop UA pool used for the PhantomJS fallback check.
try:
    fua, lua = '%s%s%s' % (this_file_abspath, os_sep,
                           'ua_list.txt'), []
    with open(fua, 'r') as fo:
        for i in fo:
            lua.append(i.replace('\n', ''))
except Exception as e:
    s = '%s%s' % ('打开文件 EXCEPTION ua文件路径: ', fua)
    logging.error(s)
    print(s)
    os._exit(4003)

dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = choice(lua)
dcap['browserName'], dcap['platform'] = '', ''


class MyThread(threading.Thread):
    """Worker thread that simply calls func(args) in run()."""

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func, self.args, self.name = func, args, name

    def run(self):
        self.func(self.args)


ctrl_start, max_script_time = time.time(), 3600 * 4


def ctrl_runtime(exit_type=''):
    """Terminate the script once total runtime exceeds max_script_time.

    exit_type selects the termination mechanism: '' -> exit(), 'sys' ->
    sys.exit(), 'os' -> os._exit(4004) (immediate, skips cleanup).
    """
    if time.time() - ctrl_start >= max_script_time:
        s = '%s%s%s%s%s%s%s%s%s' % (
            '程序开始执行时间', ctrl_start, '执行时间阈值', max_script_time, '终止执行', ' exit_type =', exit_type, ' threadID ',
            threading.get_ident())
        logging.info(s)
        if exit_type == '':
            exit(s)
        elif exit_type == 'sys':
            sys.exit(s)
        elif exit_type == 'os':
            os._exit(4004)


url_counter = 0  # global progress counter shared by all worker threads


def main():
    """Check every live order URL for our tracking code; on any startup DB
    failure, restart this script indefinitely."""
    try:
        mysql_obj = MysqlHelper()
        q = 'SELECT direct_order_id FROM test_error;'
        tuple_l = mysql_obj.select(q)
        pass_id_l = [i[0] for i in tuple_l]
        pass_id_l = [str(i) for i in pass_id_l]
        pass_id_l_s = ','.join(pass_id_l)
        del mysql_obj, tuple_l
        # A live (non-expired) URL has exactly one row in test_order.
        # Earlier variant filtered by create_time within 48h; current rule
        # is simply "not expired and not already recorded in test_error".
        mysql_obj = MysqlHelper()
        q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) < expire_time AND id NOT in ( %s ) ORDER BY id DESC ;' % (
            pass_id_l_s)
        tuple_l = mysql_obj.select(q)
        del mysql_obj
        if len(tuple_l) == 0:
            # NOTE(review): logs "nothing to check, exiting" but does not
            # actually return; tn == 0 below means no threads start anyway.
            s = '无待检测url,程序退出'
            print(s)
            logging.info(s)
    except Exception as e:
        s = '%s%s%s' % ('初始数据,查询数据库异常,无限次重启该脚本', e, time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())))
        print(s)
        logging.warning(s)
        cmd = 'python %s' % (__file__)
        os.system(cmd)
        os._exit(1024)

    # Script runs roughly hourly; per-URL policy: if the first request is as
    # expected stop, otherwise retry up to repeat_times with repeat_sleep_times
    # seconds between attempts.
    sleep_counter, sleep_step, sleep_seconds, mycode_l, repeat_times, repeat_sleep_times = 0, 20, 1, [
        'g3user.com', '51g3.com.cn'], 4, 10

    # Candidate for refactoring into a base class (generic WHERE-list select);
    # the f_l parameter exists for the current field requirements.
    def get_onerow(url, f_l=['title', 'uid', 'money_total'], tab='test_order'):
        """Fetch one row of f_l columns for url from tab; -1 on failure.

        NOTE(review): url is interpolated straight into SQL — injection risk
        if urls are attacker-controlled; MysqlHelper's API for parameterized
        queries should be used if available.
        """
        t = -1
        try:
            mysql_obj = MysqlHelper()
            f_s = ','.join(f_l)
            q = 'SELECT %s FROM %s WHERE url="%s" ORDER BY id DESC LIMIT 1' % (f_s, tab, url)
            s = '%s%s' % (' DB ', q)
            logging.info(s)
            t = mysql_obj.select(q)
            if t != -1:
                t = t[0]
            del mysql_obj
        except Exception as e:
            s = '%s%s' % (' DB ', e)
            logging.info(s)
            return t
        return t

    def chk_exception_url(url, sleep_seconds=0, http_tag='http://'):
        """Check whether url serves our tracking code.

        Returns a dict: ok (1 found, 0 missing/error, -1 undetermined),
        status_code (HTTP status or -1), info (diagnostic text).
        First tries a plain requests.get; if the code is not in the raw HTML,
        falls back to rendering with PhantomJS.
        """
        time.sleep(sleep_seconds)
        global url_counter
        ret = {}
        # DB status semantics: 0 = unreachable, 1 = opened but no ad, 2 = handled.
        # BUG FIX: initialize ret['info'] — the original only set it on some
        # paths, and the unconditional read in tf() could raise KeyError and
        # kill the worker thread.
        ret['info'] = ''
        ret['ok'], ret['status_code'], s = -1, -1, '%s%s%s%s' % (
            time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())), ' threadID ', threading.get_ident(), url)
        try:
            if url.find('http') == -1:
                url = '%s%s' % (http_tag, url)
            r = requests.get(url)
            ret['status_code'], txt_pos = int(r.status_code), -1
            s = '%s,%s,%s,%s,%s' % (s, ret['status_code'], url, r, r.reason)
        except Exception as e:
            ret['ok'] = 0
            s = '%s %s %s' % (s, ' SPIDER ', e)
            logging.error(s)
            print(e, url)
        # Currently only a 200 from the target site is treated as "reachable".
        if ret['status_code'] == 200:
            for ii in mycode_l:
                if r.text.find(ii) > -1:
                    ret['ok'], txt_pos = 1, 1
                    break
            if txt_pos == -1:
                # Code not in the raw HTML — render the page in PhantomJS in
                # case it is injected by JavaScript.
                try:
                    driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                                 executable_path='/usr/local/phantomjs/bin/phantomjs')
                    driver.get(url)
                    time.sleep(1)
                    page_source = driver.page_source
                    driver.quit()
                    for ii in mycode_l:
                        if page_source.find(ii) > -1:
                            ret['ok'] = 1
                            break
                    if ret['ok'] == -1:
                        s = '%s%s' % (s, '返回200,但是在html中未检测到我公司代码。')
                        ret['ok'], ret['info'] = 0, s
                except Exception as e:
                    s = '%s %s %s' % (s, ' SPIDER ', e)
                    logging.error(s)
                    print(e, url)
        elif ret['status_code'] == 403:
            # e.g. www.hsdcw.com/fenlei/41668214.html — 403 is tolerated.
            pass
        else:
            ret['ok'], ret['info'] = 0, s
        url_counter += 1
        s = '%s/%s%s%s' % (url_counter, len(tuple_l), 'chk-ret', s)
        print(s)
        if ret['ok'] == 0:
            logging.warning(s)
        else:
            logging.info(s)
        return ret

    tn, tl, tstep = len(tuple_l), [], 4000

    def tf(ts):
        """Worker body: check tuple_l[ts:ts+tstep] and record failures."""
        te = ts + tstep
        te = min(te, tn)
        for i in tuple_l[ts:te]:
            ctrl_runtime(exit_type='os')
            url, chk_id = i
            s = '%s%s%s%s' % (
                time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())), ' threadID ', threading.get_ident(), url)
            if chk_id in pass_id_l:
                s = '%s%s' % (s, ' 跳过,之前test_error已写入该url ')
                logging.info(s)
                print(s)
            # Rule for Sina iask: never check.
            if url.find('iask.sina.com') > -1:
                continue
            write_db_flag = 1
            for t in range(0, repeat_times, 1):
                ret = chk_exception_url(url, repeat_sleep_times)
                if ret['ok'] == 1:
                    write_db_flag = 0
                    break
            if write_db_flag == 1:
                try:
                    title, uid, money_total = get_onerow(url)
                except Exception as e:
                    s = '%s%s%s' % (s, ' DB Exception-去test_order查', e)
                    logging.info(s)
                    print(s)
                    break
                # Multithreading: the DB layer limits connections, so a fresh
                # MysqlHelper is instantiated per use and deleted afterwards.
                try:
                    mysql_obj = MysqlHelper()
                except Exception as e:
                    s = '%s%s%s' % (s, ' DB Exception- ', e)
                    logging.error(s)
                    print(s)
                    break
                # Concurrency across processes/threads; a queue would be a
                # cleaner design — left for later optimization.
                q = 'SELECT id FROM test_error WHERE url="%s" LIMIT 1' % (url)
                try:
                    r = mysql_obj.select(q)
                    s = '%s%s%s' % (s, ' -SQL- ', q)
                    logging.info(s)
                    print(q)
                except Exception as e:
                    s = '%s %s %s %s' % (s, ' DB Exception-', q, e)
                    logging.info(s)
                    print(s)
                    break
                ctime = int(time.time())
                # NOTE(review): status encoding duplicates info already in
                # ret — the schema deserves a redesign.
                db_status = 1 if ret['status_code'] == 200 else 0
                if len(r) == 0:
                    q = 'INSERT INTO test_error (title,url,status,remarks,update_time,create_time,uid,money,direct_order_id) VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (
                        title, url, db_status, ret['info'], ctime, ctime, uid, money_total, chk_id)
                    try:
                        mysql_obj.execute(q)
                        mysql_obj.commit()
                        del mysql_obj
                        s = '%s%s%s' % (s, ' DB SQL ok ', q)
                        logging.info(s)
                        print(s)
                    except Exception as e:
                        s = '%s%s%s%s' % (s, ' DB Exception- ', q, e)
                        logging.error(s)
                        print(s)
                elif len(r) == 1:
                    continue

    # One thread per tstep-sized slice of the work list.
    for i in range(0, tn, tstep):
        thread_instance = MyThread(tf, i, tf.__name__)
        tl.append(thread_instance)
    for t in tl:
        # BUG FIX: the original wrote `t.setDaemon = False`, which replaced
        # the setDaemon method with a bool instead of setting the flag; the
        # daemon attribute is the supported way to mark threads non-daemon.
        t.daemon = False
        t.start()
    for t in tl:
        t.join()


if __name__ == '__main__':
    main()

  

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys
import threading
from random import choice
# import urllib.parse
from bs4 import BeautifulSoup

# Collect Baidu mobile "related search" suggestion links for a seed keyword.
ua_list = []
with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
    for line in uafile:
        if line.find('Mozilla') > -1:
            ua_list.append(line.replace('\n', '').strip())
ua_list_len_ = len(ua_list) - 1  # highest valid index into ua_list


def close_alert(browser, attitude='accept'):
    """Placeholder: alert handling is intentionally disabled in this variant."""
    return


# Kept for parity with the other scripts; the emulated-UA setup that used
# this index is disabled here and a plain desktop Chrome is launched.
ua_list_index = random.randint(0, ua_list_len_)
browser = webdriver.Chrome()
s_wd = '长尾'
url_seed = 'https://m.baidu.com/s?word=s_wd'
url_seed = url_seed.replace('s_wd', s_wd)
print(url_seed)
browser.get(url_seed)
# Parse the rendered SERP and pull every related-search anchor (.rw-item).
rd = BeautifulSoup(browser.page_source, 'html.parser').find_all('a', class_='rw-item')
res_d_l = [{'contents': d.contents, 'href': d.attrs['href']} for d in rd]
browser.quit()
d = 3

最新文章

  1. Images.xcassets不能获取图片路径
  2. Linux下apache+phppgadmin安装配置
  3. Entity Framework 实体框架的形成之旅--界面操作的几个典型的处理(8)
  4. jquery绑定事件失效的情况(转)
  5. Server Tomcat v7.0 Server at localhost was unable
  6. StringBuilder、StringBuffer和String三者的联系和区别(转)
  7. ESFramework 4.0 进阶(01)-- 消息
  8. iOS NSTimer
  9. MongoDB增删改查操作详解
  10. 用UltraISO制作CentOS U盘安装盘
  11. 三大家族,offset,scroll,client
  12. OpenCV-Python-图像梯度
  13. 14: InfluxDB+Grafana打造大数据监控利器
  14. Testlink1.9.17使用方法(第五章 测试用例管理)
  15. Django中model层详解
  16. Paint the Tree
  17. 12.4 hdfs总结
  18. Java注解的原理
  19. html导出pdf的四种方式
  20. 【Unity】4.3 地形编辑器

热门文章

  1. 联想T470设置U盘启动
  2. sql取随机结果集
  3. Win7 无法将快捷方式从任务栏移除怎么办
  4. iOS技巧
  5. MySQL几个性能指标
  6. Windows重装系统
  7. DbVisualizer出现下列错误:Could not read XML file
  8. Extjs4 Combobox 联动始终出现loading错误的解决的方法
  9. GDBus
  10. kernel BUG