爬取豆瓣top250部电影

####创建表:
#connect.py
import os

from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# Database connection URL, in the form
#   mysql+pymysql://USERNAME:PASSWORD@HOSTNAME:PORT/DATABASE?charset=utf8
# The literal below is the original hard-coded default; set the
# DOUBAN_DB_URL environment variable to override it without editing the file.
db_url = os.environ.get(
    'DOUBAN_DB_URL',
    'mysql+pymysql://root:123456@localhost:3306/douban?charset=utf8',
)
engine = create_engine(db_url)

# Declarative base class that all ORM models in this module inherit from.
# NOTE(review): passing the engine positionally binds the metadata on
# SQLAlchemy 1.x; that argument no longer exists in 2.0 -- confirm the
# installed version before upgrading.
Base = declarative_base(engine)

# Session factory bound to the engine, plus one module-level session that
# other scripts import and share.
Session = sessionmaker(engine)
session = Session()
##################创建表
from sqlalchemy import Column,String,Integer,DateTime
from datetime import datetime
class Douban(Base):
    """ORM model mapped to the ``douban`` table; one row per movie."""

    __tablename__ = 'douban'

    # Surrogate primary key, auto-incremented.
    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(50))
    author = Column(String(100), nullable=True)
    actor = Column(String(100))
    # Declared as a string column, not a date/time type.
    time = Column(String(50))
    country = Column(String(100))
    type = Column(String(100))
    # The callable (not its result) is passed, so the default is evaluated
    # for each new row rather than once at import time.
    createtime = Column(DateTime, default=datetime.now)

    def __repr__(self):
        """Return a debug string listing every column value of this row."""
        return '<Douban(id=%s,name=%s,author=%s,actor=%s,time=%s,country=%s,type=%s,createtime=%s)>' % (
            self.id,
            self.name,
            self.author,
            self.actor,
            self.time,
            self.country,
            self.type,
            # Fixed: the original read ``self.creatime``, which is not a
            # defined attribute and raised AttributeError.
            self.createtime,
        )
if __name__ == '__main__':
    # Create every table declared on Base. The engine is passed explicitly
    # so the call does not depend on the metadata being bound to an engine.
    Base.metadata.create_all(engine)

    # Example of inserting a row by hand (kept for reference):
    # user = Douban()
    # user.type = '你好'
    # user.country = '你'
    # user.author = '666'
    # user.actor = '你好啊'
    # session.add(user)
    # session.commit()

### Crawl the data and save it to the database:
#douban.py
import requests,re
from bs4 import BeautifulSoup
import time,datetime # import pymysql
# conn=pymysql.connect(host='127.0.0.1',user='root',passwd='123456',db='mysql',charset='utf8')
# cur=conn.cursor()
# cur.execute('use douban;')
# cur.execute("insert into douban.douban(author,actor,country) VALUES('aa','bb','bb')")
# conn.commit()
# 导入sqlalchemy
from connect import Douban, session

# HTTP headers sent with every request to the Top 250 pages.
headers = {
    'Referer': 'https://movie.douban.com/explore',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; '
                  'WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
}


def _split_info(lines):
    """Split a movie's info-paragraph lines into its five text fields.

    ``lines[0]`` is split on three non-breaking spaces into author and
    actor; ``lines[1]`` is split on ``'\\xa0/\\xa0'`` into time, country
    and type. Any field that is absent comes back as an empty string
    instead of raising IndexError.

    Returns:
        A tuple ``(author, actor, year, country, genre)`` of strings.
    """
    people = lines[0].split('\xa0\xa0\xa0') if lines else []
    details = lines[1].split('\xa0/\xa0') if len(lines) > 1 else []
    # Pad short lists; a non-positive multiplier yields [] and is a no-op.
    people += [''] * (2 - len(people))
    details += [''] * (3 - len(details))
    return people[0], people[1], details[0], details[1], details[2]


def get_html(x):
    """Scrape Douban Top 250 list pages and store each movie in the database.

    Pages ``0`` through ``x`` inclusive are fetched (``x + 1`` requests),
    using ``start=n*25`` for page ``n``. Every ``div.item`` found on a page
    is printed and added to the shared session as a ``Douban`` row; the
    session is committed once per page.

    Args:
        x: Index of the last page to fetch (0-based, inclusive).

    Raises:
        requests.RequestException: On a network error, timeout, or a
            non-success HTTP status.
        Exception: Any parsing or database error is re-raised after the
            current page's pending rows have been rolled back.
    """
    num = 0
    for n in range(x + 1):
        url = 'https://movie.douban.com/top250?start=%s&filter=' % (n * 25)
        # A timeout keeps a stalled connection from hanging the script; a
        # bad status is raised instead of silently parsing an error page.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        try:
            for m in soup.select('div[class="item"]'):
                num += 1
                title = m.select('span[class="title"]')[0].string
                print(title)

                # The info paragraph; stripped_strings yields its text
                # fragments with surrounding whitespace removed.
                content = m.select('div[class="bd"] > p[class=""]')[0]
                li = [str(s) for s in content.stripped_strings]
                print(li)

                # ``year`` and ``genre`` replace the original locals
                # ``time`` and ``type``, which shadowed the imported
                # module and the builtin.
                author, actor, year, country, genre = _split_info(li)
                print(author)
                print(actor)
                print(year)
                print(country, genre + '\n\n')
                print('总共获取%s' % num)

                # ``time`` is stored as the scraped string; the model
                # declares that column as String(50).
                session.add(Douban(
                    name=title,
                    author=author,
                    actor=actor,
                    time=year,
                    country=country,
                    type=genre,
                ))
            session.commit()
        except Exception:
            # Leave the shared session usable after a failed page.
            session.rollback()
            raise


if __name__ == '__main__':
    get_html(int(input('输入数字:')))

最新文章

  1. [WPF系列]-Adorner
  2. 编译原理LL1文法分析树(绘图过程)算法实现
  3. 2015GitWebRTC编译实录16
  4. selenium ide 录制回放link链接报错
  5. php版获取重定向后地址的代码分享
  6. rest-work-eat-study-rest-work-eat or rest-rest-work-work-eat-eat..
  7. Java有用的经验--Swing片
  8. 一步步优化JVM五:优化延迟或者响应时间
  9. Linq skip skipwhile take takewhile
  10. jemalloc Mongodb Nginx 优化
  11. java_eclipse添加DID实现自动提示
  12. Android为TV端助力:UDP协议(接收组播和单播)
  13. 《TypeScript 中文入门教程》
  14. Android开发 - 更"聪明"的申请权限方式
  15. 使用w uptime vmstat top sar nload 等命令查看系统负载
  16. (未解决)记录一次登录&jmeter,留下的一地鸡毛
  17. 【jersey】 spring 整合jersey 实现RESTful webservice
  18. weblogic基本安装部署
  19. 宽带、ADSL、以太网、PPPoE
  20. BZOJ4597 SHOI2016随机序列(线段树)

热门文章

  1. poj 3685 矩阵问题 查找第K小的值
  2. 关于Python、Java、C#语言的一些比较
  3. javascript类式继承模式#2——借用构造函数
  4. SQL Server无法连接到数据库
  5. CSS系列(6) CSS通配符详解
  6. Python 爬虫-豆瓣读书
  7. 融合模型Aggregation
  8. Opencv4.0.0安装包
  9. Linux开启MySQL远程连接
  10. android ListView与EditText共存错位