__main__ — Top-level script environment
2024-10-07 06:44:29
Reference: 29.4. "__main__ — Top-level script environment", Python 3.6.1 documentation — https://docs.python.org/3/library/__main__.html
D:\pyTOgo\mongoTrans.py
from tool import *
import re

# ----------------------------------------------------------------------
# DANGER ZONE — irreversible operations. Kept commented out on purpose.
#
# Batch delete:
# deleteMany({'spiderDate': '20180903'}, 'todayUrls')
# ----------------------------------------------------------------------

# Reversible operations below.
#
# Strip '-' redundancy from date strings:      .replace('-', '')
# Strip 'URL:' redundancy from webSite field:  .replace('URL:', '')
# cleanData = selectToDic('_id', 'todayUrls', fields={'webSite': 1, 'spiderDate': 1})
# cleanData = selectToDic('_id', 'todayUrls', fields={'Base64parse2times': {'$exists': True}})

# One-off cleanup (disabled): delete malformed cnhan.com/pinfo/ URLs
# crawled on 2018-09-06 whose path does not match the expected pattern.
#
# collection_name = 'todayUrls'
# cleanData = selectToDic('_id', collection_name, fields={'url': 1}, where={'spiderDate': '20180906'})
# for _id, item in cleanData.items():
#     url = item['url']
#     pathTag = 'cnhan.com/pinfo/'
#     # delete when the URL mentions the path but fails the strict pattern
#     if pathTag in url and re.match(r'^http://www.cnhan.com/pinfo/\d+\.html$', url) is None:
#         print(_id, url)
#         deleteOne({'_id': _id}, collection_name)
def improve():
    """Normalize legacy field formats in the 'todayUrls' collection.

    - strips '-' from 'spiderDate' (e.g. '2018-09-06' -> '20180906')
    - strips the 'URL:' prefix from 'webSite' when present

    Both rewrites are idempotent, so re-running is safe.
    """
    cleanData = selectToDic('_id', 'todayUrls', fields={})
    for _id, item in cleanData.items():
        # Guard against documents missing the field, consistent with the
        # existing 'webSite' guard below (original raised KeyError here).
        if 'spiderDate' in item:
            updateOneIdKV(_id, 'spiderDate', item['spiderDate'].replace('-', ''))
            print('improve', _id)
        if 'webSite' in item:
            updateOneIdKV(_id, 'webSite', item['webSite'].replace('URL:', ''))


def uniqueUrlSpiderDate(collectionMame='todayUrls'):
    """Deduplicate documents sharing the same url + spiderDate, keeping one.

    :param collectionMame: collection to clean. NOTE: the parameter name keeps
        the original 'Mame' typo so existing keyword callers stay compatible.
    :return: None; offending documents are deleted from the collection.
    """
    # Map "url + spiderDate" -> list of _ids carrying that combination.
    spiderDate_url_set = {}
    cleanData = selectToDic('_id', collectionMame, fields={'spiderDate': 1, 'url': 1})
    for _id, item in cleanData.items():
        k = item['url'] + item['spiderDate']
        spiderDate_url_set.setdefault(k, []).append(_id)
    # Keep the first _id seen per combination. Use a set for membership:
    # the original tested `_id not in save_id_l` on a list, which is O(n^2).
    save_ids = {ids[0] for ids in spiderDate_url_set.values()}
    for _id in cleanData:
        if _id not in save_ids:
            deleteOne({'_id': _id}, collectionMame)
            print('uniqueUrlSpiderDate', _id)


if __name__ == "__main__":
    improve()
    uniqueUrlSpiderDate()
    uniqueUrlSpiderDate('siteUserPage')

# ===== D:\pyTOgo\dataAppend.py =====
from tool import RandomString, selectToDic, updateOne
from mongoTrans import improve, uniqueUrlSpiderDate
from bs4 import BeautifulSoup
import requests, time, json, random

# Master switch for the one-off DB cleanup pass; hard-disabled.
# (The original wrote `True if 7 > 9 else False`, which always evaluates
# to False — spelled out literally so the intent is obvious.)
cleanDbSwitcher = False
if cleanDbSwitcher:
    improve()
    uniqueUrlSpiderDate()
    uniqueUrlSpiderDate('siteUserPage')

# ===== a.py =====
def fa():
    """Placeholder function; does nothing and returns None."""


def fa2():
    """Placeholder function; does nothing and returns None."""


# Module-level call: executes whenever this module is imported,
# because there is no `if __name__ == "__main__":` guard.
fa2()
b.py
from a import fa   # (the original wrote "import fa from a", which is invalid syntax)
# Because a.py has no `if __name__ == "__main__":` guard, its module-level
# fa2() call runs as a side effect of this import.
最新文章
- 手工配置rsyslog配置文件详解
- Qt之C语言有符号数与无符号数运算
- .net 制作二维码
- SSM项目配置随笔
- Junit3与Junit4的区别
- hdu1242 优先队列+bfs
- 用Nikto探测一个网站所用到的技术
- 回调--一个经典例子让你彻彻底底理解java回调机制
- js的replace方法
- Keil MDK 无法设置断点【worldsing】
- JAVA客户端API调用memcached两种方式
- 在MacOSX下用管理员权限打开App应用程序
- .Net程序员 Solr-5.3之旅 (一)Solr入门
- Python学习笔记整理(十)Python的if测试
- 只响应ccTouchBegan的问题
- 红米手机5 Plus完美刷成开发版获取root权限的教程
- python--自己实现的单链表常用功能
- laravel框架5.2版本组件包开发
- 解决myeclipse2017安装后闪退问题
- JS 判断 undefined 类型