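1.下载对应版本的 Python MySQL 模块(下文代码通过 import MySQLdb 使用,即 mysqlclient 包)。我的是:pymssql-2.2.0.dev0-cp36-cp36m-win_amd64.whl(注意:pymssql 是 SQL Server 的驱动,此处文件名疑似笔误,请以 mysqlclient 对应版本的 whl 为准)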

2.手动创建table

-- Storage table for the crawled records: an auto-increment key plus
-- 25 varchar columns, one per field written by the crawler's insert statement.
create table grilsbase
(
    id                     int primary key auto_increment,
    name                   varchar(50),
    height                 varchar(50),
    bwh                    varchar(50),
    title                  varchar(100),
    img_upload             varchar(100),
    pc_img_upload          varchar(100),
    resource_id            varchar(50),
    totals                 varchar(50),
    recommend_id           varchar(50),
    date                   varchar(50),
    headimg_upload         varchar(50),
    show_datetime          varchar(50),
    client_show_datetime   varchar(50),
    video_duration         varchar(50),
    free_select            varchar(50),
    trial_time             varchar(50),
    viewtimes              varchar(50),
    coop_customselect_654  varchar(50),
    coop_id                varchar(50),
    tag_class              varchar(50),
    tag_name               varchar(50),
    playerid               varchar(50),
    block_detailid         varchar(50),
    type                   varchar(50),
    istop                  varchar(50)
)

3.实现爬虫代码

导入模块:requests、os、json、re、MySQLdb、threading

流程:获取数据=>分析数据=>解析数据=>持久化保存

 #coding:utf-8
import requests
import os
import json
import re
import MySQLdb
import threading
# URLs the crawler reads from.
# gilsUrl: JavaScript file whose "var ugirlsData=..." assignment carries the JSON payload.
gilsUrl = 'http://act.vip.xunlei.com/ugirls/js/ugirlsdata.js'
gilsDetailUrl = 'http://meitu.xunlei.com/detail.html'
# gilsImgUrl: template filled with (resource_id, file name) for each numbered image.
gilsImgUrl = 'http://data.meitu.xunlei.com/data/image/%s/%s'
executor = threading.BoundedSemaphore(10)
# Raw strings avoid the invalid "\/" and "\." escape sequences of a normal literal.
# regex captures the trailing "<name>.jpg" of a URL; regexhead captures "<name>" only.
regex = re.compile(r'/([^/]*?\.jpg)$')
regexhead = re.compile(r'/([^/]*?)\.jpg$')
class MySQL:
    """Small MySQLdb helper that opens a fresh connection for every statement."""

    def __init__(self, host, user, pwd, db):
        """Remember the connection settings; nothing is opened here."""
        self.host = host
        self.user = user
        self.db = db
        self.pwd = pwd

    def GetConnect(self):
        """Open a connection, keep it on ``self.connect`` and return a cursor.

        Raises:
            NameError: if no database name is configured, or no cursor
                could be obtained from the connection.
        """
        if not self.db:
            # The original ``raise(NameError, msg)`` tried to raise a tuple,
            # which is a TypeError in Python 3.
            raise NameError('没有目标数据库')
        self.connect = MySQLdb.connect(
            host=self.host,
            user=self.user,
            password=self.pwd,
            database=self.db,
            port=3306,
            charset='utf8',
        )
        cur = self.connect.cursor()
        if not cur:
            self.connect.close()
            raise NameError('数据库访问失败')
        return cur

    def ExecSql(self, sql, params=None):
        """Execute a data-modifying statement and commit it.

        Args:
            sql: statement text; use ``%s`` placeholders when ``params`` is given.
            params: optional sequence of values bound by the driver.
        """
        cur = self.GetConnect()
        try:
            cur.execute(sql, params)
            self.connect.commit()
        finally:
            # Close even when execute/commit raises, so the connection never leaks.
            self.connect.close()

    def ExecQuery(self, sql, params=None):
        """Execute a query and return every fetched row.

        Args:
            sql: statement text; use ``%s`` placeholders when ``params`` is given.
            params: optional sequence of values bound by the driver.
        """
        cur = self.GetConnect()
        try:
            cur.execute(sql, params)
            return cur.fetchall()
        finally:
            self.connect.close()


def getGirlsData():
    """Download the data script, save its JSON payload to disk and return it parsed.

    The payload is the text following ``var ugirlsData=`` in the response; it
    is written unchanged to ``ugirlsdata.json`` in the working directory.

    Raises:
        ValueError: if the response holds no ``var ugirlsData=`` assignment
            (or the payload is not valid JSON).
    """
    payload_pattern = re.compile("var ugirlsData=(.+)")
    r = requests.get(gilsUrl)
    matches = payload_pattern.findall(r.text)
    if not matches:
        raise ValueError('no "var ugirlsData=" assignment found in ' + gilsUrl)
    payload = matches[0]
    with open('ugirlsdata.json', 'w', encoding='utf-8') as f:
        f.write(payload)
    return json.loads(payload)


def getImgName(imgurl):
    """Return the trailing ``<name>.jpg`` of ``imgurl``, or ``''`` if there is none."""
    if not imgurl:
        return ''
    found = regex.findall(imgurl)
    return found[0] if found else ''


def getImgNameHead(imgurl):
    """Return the trailing file name of ``imgurl`` without ``.jpg``, or ``''``."""
    if not imgurl:
        return ''
    found = regexhead.findall(imgurl)
    return found[0] if found else ''


# Columns filled by WriteDB, in insert order.
INSERT_COLUMNS = (
    'name', 'height', 'bwh', 'title', 'img_upload', 'pc_img_upload',
    'resource_id', 'totals', 'recommend_id', 'date', 'headimg_upload',
    'show_datetime', 'client_show_datetime', 'video_duration', 'free_select',
    'trial_time', 'viewtimes', 'coop_customselect_654', 'coop_id', 'tag_class',
    'tag_name', 'playerid', 'block_detailid', 'type', 'istop',
)

# Parameterized insert: the driver quotes the values, so quotes inside the
# scraped text can neither break the statement nor inject SQL.
INSERT_SQL = 'insert into grilsbase(%s) values(%s)' % (
    ','.join(INSERT_COLUMNS),
    ','.join(['%s'] * len(INSERT_COLUMNS)),
)


def _row_values(data):
    """Build the value list for INSERT_SQL from one record dict."""
    values = []
    for column in INSERT_COLUMNS:
        if column in ('img_upload', 'headimg_upload'):
            # Only the image file name is stored, not the full URL.
            values.append(getImgName(data.get(column, '')))
        elif column in ('tag_class', 'tag_name', 'playerid'):
            # These keys are optional in a record; a missing one is stored as ''.
            values.append(str(data.get(column, '')))
        else:
            values.append(str(data[column]))
    return values


def WriteDB(jsdata):
    """Insert every record of ``jsdata`` into ``grilsbase`` and download its images.

    Args:
        jsdata: iterable of record dicts as returned by ``getGirlsData``.

    Raises:
        KeyError: if a record lacks one of the mandatory keys.
    """
    # NOTE(review): connection settings are hard-coded here.
    ms = MySQL(host="192.168.0.108", user="lin", pwd="", db="grils")
    for data in jsdata:
        ms.ExecSql(INSERT_SQL, _row_values(data))
        print('完成'+data['name']+'数据更新...')
        DownImg(
            data['name'],
            data["totals"],
            data['resource_id'],
            # Same tolerance for a missing key as in the insert above.
            data.get("headimg_upload", ''),
            data.get("img_upload", ''),
        )


def DownImg(name, totals, resource_id, headimg_upload, img_upload):
    """Download the two cover images and the numbered images of one record.

    Args:
        name: record name (not used for the download itself).
        totals: number of numbered images; images 1..totals are fetched.
        resource_id: id used for the target directory and the image URL.
        headimg_upload: URL of the head image; skipped when blank.
        img_upload: URL of the cover image; skipped when blank.
    """
    path = creatFile(resource_id)
    for cover_url in (headimg_upload, img_upload):
        if cover_url and cover_url.strip() != '':
            DownImgRun(cover_url, path, getImgNameHead(cover_url))
    for i in range(1, int(totals) + 1):
        url = gilsImgUrl % (resource_id, str(i) + '.jpg')
        DownImgRun(url, path, i)


def DownImgRun(url, path, i):
    """Fetch ``url`` and, on HTTP 200, save the body as ``<path>/<i>.jpg``."""
    r = requests.get(url)
    if r.status_code == 200:
        with open(path + '/' + str(i) + '.jpg', 'wb') as fimg:
            fimg.write(r.content)


def creatFile(dirname):
    """Make sure ``./pic/<dirname>`` exists and return that path."""
    path = './pic/' + str(dirname)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)
    return path


if __name__ == '__main__':
    gri = getGirlsData()
    WriteDB(gri)

4.运行效果 和结果

最新文章

  1. 使用Packet Sniffer抓包和分析(z-stack协议)
  2. 2015Web前端攻城之路
  3. “display:block-inline形式的Span或Div中添加文字后,导致Span或Div排版掉落、错位”的原因及解决方法
  4. 改数(洛谷 U5398)
  5. 较好的IOS新闻客户端应用源码
  6. CSS基础(02)
  7. [Gauss]HDOJ3976 Electric resistance
  8. Swift - 22 - 循环结构
  9. Web项目中用模板Jsp页面引入所有静态样式脚本文件(js,css等)
  10. Hibernate学习——映射关系
  11. NSInteger到底是什么数据类型
  12. selenium用法详解
  13. JAVA 11初体验
  14. C# 关键字const与readonly的区别
  15. VMware workstation12 密匙
  16. VM 虚拟机网络配置
  17. Js页面刷新前提示-jquery页面刷新事件
  18. 设置oracle主键自增长
  19. PHP 大神的十大优良习惯
  20. MSSQL-SQL SERVER一些使用中的技巧

热门文章

  1. day26-多态、封装、反射
  2. UVA 10382 Watering Grass 贪心+区间覆盖问题
  3. js获取table checkbox选中行的值
  4. 在MRC模式下使用SDWebImage
  5. ExecutorService线程池submit的使用
  6. Android开发 SeekBar开发记录
  7. CF774L Bars
  8. jmeter遇到的问题:java.net.ConnectException: Connection refused: connect
  9. leetcode-216-组合总和③
  10. APB简介