# -*- coding: utf-8 -*-
from tweepy import OAuthHandler
import datetime
import pandas as pd
import tweepy
import time
import random
import traceback
from dbs.db import * class Twitter_Spider():
def __init__(self):
self.main_tw_url="https://twitter.com/{}/status/{}"
self.china_time_list = []
self.twitter_id_list = []
self.twitter_url_list = []
self.twitter_text_list = []
self.twitter_url_list = []
self.update_time_list = []
self.twitter_dicts = {}
self.user_id_list = []
self.user_name_list = []
self.crate_time_list=[]
self.userdicts={}
self.stopflag=False def getapi(self):
consumer_key = 'IAaj345Xf673kzT2'
consumer_secret = 'ee9WEQ235555We0gP4peRbOPeeHGX1'
access_token = '9767625356VEnq7s9ZXOHEI'
access_secret = 'lyqj2122333o9G4fHta'
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth)
return api def start(self):
api =None
try:
api = self.getapi()
except tweepy.TweepError:
print('Error! Failed to get access token.') nowdate = datetime.datetime.now()
beforeweek=nowdate-datetime.timedelta(days=7) #一周前
#只获取关注者id
# ids = []
# for page in tweepy.Cursor(api.friends_ids).pages():
# ids.extend(page)
# 获取关注者id和screen_name for pages in tweepy.Cursor(api.friends).pages():
for page in pages:
userstr = page._json
self.userdicts.setdefault(userstr.get("id"),userstr.get("screen_name"))
for user_id in self.userdicts.keys():
self.stopflag=False
self.twitter_dicts.clear()
for i,statuslist in enumerate(tweepy.Cursor(api.user_timeline, id=user_id).pages()): #获取第一页,一页20个
if not self.stopflag:
print("start page:{}".format(i))
for status in statuslist:
jsonstr = status._json
if self.getItem(jsonstr,beforeweek,user_id):
break
else:
break
time.sleep(random.randint(2, 6))
self.twitter_dicts.setdefault("user_id", self.user_id_list)
self.twitter_dicts.setdefault("user_name", self.user_name_list)
self.twitter_dicts.setdefault("china_time",self.china_time_list)
self.twitter_dicts.setdefault("tw_time", self.crate_time_list)
self.twitter_dicts.setdefault("tw_id", self.twitter_id_list)
self.twitter_dicts.setdefault("tw_text", self.twitter_text_list)
self.twitter_dicts.setdefault("tw_url", self.twitter_url_list)
self.twitter_dicts.setdefault("updatetime", self.update_time_list)
try:
SaveData().save_object_data(self.twitter_dicts)
except:
print(traceback.format_exc("insert db error")) def getItem(self, jsonstr,beforeweek,user_id):
create_time = jsonstr.get("created_at")
china_time=""
try:
china_time=datetime.datetime.strptime(create_time,"%a %b %d %H:%M:%S +0000 %Y")
if beforeweek>china_time:
self.stopflag=True
return self.stopflag
except:
print(traceback.format_exc())
screen_name=self.userdicts.get(user_id)
self.china_time_list.append(china_time)
self.crate_time_list.append(create_time)
self.user_id_list.append(user_id)
self.user_name_list.append(screen_name)
twitter_id = jsonstr.get("id")
self.twitter_id_list.append(twitter_id)
self.twitter_url_list.append(self.main_tw_url.format(screen_name,twitter_id))
twitter_text = jsonstr.get("text")
print(twitter_text)
self.twitter_text_list.append(twitter_text)
self.update_time_list.append(datetime.datetime.now())
return self.stopflag if __name__ == "__main__":
Twitter_Spider().start()

  

最新文章

  1. [译]AngularJS中DOM操作
  2. 使用Apache Server 的ab进行web请求压力测试
  3. 网页 css 样式 初始化
  4. Codeforces 260 A - A. Laptops
  5. C/C++程序员必须熟练应用的开源项目[转]
  6. CN今日凌晨出现全部瘫痪的故障,持续近6个小时
  7. mysql中使用update select
  8. QT变异版本下载(SJLJ长跳转,DWARF不传递错误(32位专用),SEH(64位专用)),以及QT的实验室项目
  9. LeetCode_Permutations
  10. 由查找session IP 展开---函数、触发器、包
  11. 静态关键字static用法。
  12. ps图层的基本使用
  13. (最短路 Floyd) P2910 [USACO08OPEN]寻宝之路Clear And Present Danger 洛谷
  14. POJ 3436 ACM Computer Factory 最大流,拆点 难度:1
  15. Zero-Copy技术
  16. 微信小程序组件 模块化错和叹号
  17. 五、python小功能记录——打包程序
  18. BMI计算器
  19. arp_annouce=2详解
  20. view的superview的变换

热门文章

  1. 标准H5文件头的写法
  2. (原创)白话KMP算法(续)
  3. LeetCode 92 ——反转链表 II
  4. 启动 SQL Server 管理 Studio 在 SQL Server 2008R2 中的错误消息:"无法读取此系统上以前注册的服务器的列表" 解决方法
  5. JavaScript - arguments object
  6. BZOJ 4408 FJOI2016 神秘数 可持久化线段树
  7. 【翻译】ASP.NET Core 入门
  8. 个人作业Week3-案例分析(201521123103 吴雅娟)
  9. 基于log4j的消息流的实现之二消息传递
  10. 后缀数组 模板题 hdu1403(最长公共(连续)子串)