from openpyxl import Workbook
import xlrd
import time
import Levenshtein as Le

# Rows whose city is not in this list are skipped.
target_city_list = ['北京市', '上海市', '深圳市', '广州市']
source_name = 'JMTool任务_csv_py_wholeCSV-加百度170826165729'
# '|-|' separates the candidates inside a BDpoi_list cell; '|--|' separates a
# candidate's name from its address.
BDpoi_list_tag, BDpoi_list_tagb = '|-|', '|--|'
FEXCEL = '%s%s' % (source_name, '.xlsx')
# Weights applied to Le.ratio and Le.seqratio when building the combined score.
weight_ratio, weight_seqratio = 0.7, 0.3


def main_():
    """Score the candidates of every input row and save two result workbooks.

    Reads the first sheet of ``FEXCEL``; every row must have exactly nine
    cells (dbid, area_code, ref_area_type_code, city, district, address,
    city_street, name_, BDpoi_list). Rows whose first cell is ``'dbid'`` and
    rows whose city is not in ``target_city_list`` are skipped.

    For each remaining row, ``BDpoi_list`` is split into candidates and each
    candidate gets a combined score
    ``weight_ratio * Le.ratio(name_, BD_name)
    + weight_seqratio * Le.seqratio([name_, full address], [BD_name, BD_addr])``.
    The scored rows are stored per city / district / name_ in ascending score
    order. A row with no usable candidate is stored once with three blank
    trailing cells.

    Two workbooks are saved in the current directory:

    * ``<source_name>-Levenshtein<timestamp>.xlsx`` with every stored row;
    * ``<first file name>-Levenshtein-ordered<timestamp>.xlsx`` with only the
      last stored row of each name, i.e. the best-scoring candidate of the
      most recently read source row carrying that name.

    Raises:
        ValueError: if a sheet row does not have exactly nine cells.
    """
    # NOTE(review): xlrd 2.x dropped .xlsx support -- confirm the installed
    # xlrd is < 2.0 or migrate this read to openpyxl.load_workbook.
    table = xlrd.open_workbook(FEXCEL).sheets()[0]

    res_dic = {}
    for i in range(table.nrows):
        (dbid, area_code, ref_area_type_code, city, district, address,
         city_street, name_, BDpoi_list) = table.row_values(i)
        if dbid == 'dbid':
            # Header row of the input sheet.
            continue
        if city not in target_city_list:
            continue

        source_cells = (dbid, area_code, ref_area_type_code, city, district,
                        address, city_street, name_, BDpoi_list)
        rows_for_name = (res_dic.setdefault(city, {})
                         .setdefault(district, {})
                         .setdefault(name_, []))

        scored = []  # (combined score, output row) pairs for this source row
        if BDpoi_list_tag in BDpoi_list:
            addr_ = '%s%s%s%s' % (city, district, address, city_street)
            chk_name_list = [name_, addr_]
            for candidate in BDpoi_list.split(BDpoi_list_tag):
                if not candidate:
                    continue
                parts = candidate.split(BDpoi_list_tagb)
                BD_name, BD_addr = parts[0], ''
                if len(parts) == 2:
                    # City and district are stripped from the candidate
                    # address before it is compared and stored.
                    BD_addr = parts[1].replace(city, '').replace(district, '')
                ratio_res = Le.ratio(name_, BD_name)
                seqratio_res = Le.seqratio(chk_name_list, [BD_name, BD_addr])
                ratio_seqratio_res = (weight_ratio * ratio_res
                                      + weight_seqratio * seqratio_res)
                scored.append((
                    ratio_seqratio_res,
                    source_cells + (BD_name, BD_addr, ratio_seqratio_res,
                                    ratio_res, seqratio_res),
                ))
            # Stable ascending sort: equal scores keep their input order and
            # the best candidate ends up last.
            scored.sort(key=lambda pair: pair[0])

        if scored:
            rows_for_name.extend(row for _, row in scored)
        else:
            # No separator at all, or only empty candidates between
            # separators: keep the source row with blank match cells so it is
            # not dropped and the per-name list is never empty.
            rows_for_name.append(source_cells + ('', '', ''))

    source_columns = ['dbid', 'area_code', 'ref_area_type_code', 'city',
                      'district', 'address', 'city_street', 'name_',
                      'BDpoi_list']

    # Workbook 1: every stored row.
    wb = Workbook()
    worksheet = wb.active
    worksheet.append(source_columns + ['BD_name', 'BD_addr',
                                       'ratio_seqratio_res', 'ratio_res',
                                       'seqratio_res'])
    for districts in res_dic.values():
        for names in districts.values():
            for stored_rows in names.values():
                for stored_row in stored_rows:
                    worksheet.append(stored_row)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s' % (source_name, '-Levenshtein', localtime_)
    wb.save('%s%s' % (file_name, '.xlsx'))

    # Workbook 2: only the last stored row of each name.
    wb = Workbook()
    worksheet = wb.active
    worksheet.append(source_columns + ['max_BD_name', 'max_BD_addr',
                                       'max_ratio_seqratio_res', 'ratio_res',
                                       'seqratio_res'])
    for districts in res_dic.values():
        for names in districts.values():
            for stored_rows in names.values():
                worksheet.append(stored_rows[-1])
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    # The second name is built on top of the first output's name, so it
    # carries both suffixes and both timestamps.
    file_name = '%s%s%s' % (file_name, '-Levenshtein-ordered', localtime_)
    wb.save('%s%s' % (file_name, '.xlsx'))


main_()

  

from openpyxl import Workbook
import xlrd
import time
import Levenshtein as Le

# Rows whose city is not in this list are skipped.
target_city_list = ['深圳市']
# '|-|' separates the candidates inside a BDpoi_list cell; '|--|' separates a
# candidate's name from its address.
BDpoi_list_tag, BDpoi_list_tagb = '|-|', '|--|'
source_name = 'JMTool任务_csv_py_wholeCSV_住宅小区-加百度170826152533'
FEXCEL = '%s%s' % (source_name, '.xlsx')
# Weights applied to Le.ratio and Le.seqratio when building the combined score.
weight_ratio, weight_seqratio = 0.7, 0.3


def main_():
    """Score the candidates of every input row and save two result workbooks.

    Reads the first sheet of ``FEXCEL``; every row must have exactly ten
    cells (dbid, area_code, ref_area_type_code, city, district, address,
    city_street, name_, name_reduction, BDpoi_list). Rows whose first cell is
    ``'dbid'`` and rows whose city is not in ``target_city_list`` are skipped.

    For each remaining row, ``BDpoi_list`` is split into candidates and each
    candidate gets a combined score
    ``weight_ratio * Le.ratio(name_reduction, BD_name)
    + weight_seqratio * Le.seqratio([name_reduction, full address],
    [BD_name, BD_addr])``. Similarity uses ``name_reduction``, while the
    scored rows are stored per city / district / ``name_`` in ascending score
    order. A row with no usable candidate is stored once with three blank
    trailing cells.

    Two workbooks are saved in the current directory:

    * ``<source_name>-Levenshtein<timestamp>.xlsx`` with every stored row;
    * ``<first file name>-Levenshtein-ordered<timestamp>.xlsx`` with only the
      last stored row of each name, i.e. the best-scoring candidate of the
      most recently read source row carrying that name.

    Raises:
        ValueError: if a sheet row does not have exactly ten cells.
    """
    # NOTE(review): xlrd 2.x dropped .xlsx support -- confirm the installed
    # xlrd is < 2.0 or migrate this read to openpyxl.load_workbook.
    table = xlrd.open_workbook(FEXCEL).sheets()[0]

    res_dic = {}
    for i in range(table.nrows):
        (dbid, area_code, ref_area_type_code, city, district, address,
         city_street, name_, name_reduction,
         BDpoi_list) = table.row_values(i)
        if dbid == 'dbid':
            # Header row of the input sheet.
            continue
        if city not in target_city_list:
            continue

        source_cells = (dbid, area_code, ref_area_type_code, city, district,
                        address, city_street, name_, name_reduction,
                        BDpoi_list)
        rows_for_name = (res_dic.setdefault(city, {})
                         .setdefault(district, {})
                         .setdefault(name_, []))

        scored = []  # (combined score, output row) pairs for this source row
        if BDpoi_list_tag in BDpoi_list:
            addr_ = '%s%s%s%s' % (city, district, address, city_street)
            chk_name_list = [name_reduction, addr_]
            for candidate in BDpoi_list.split(BDpoi_list_tag):
                if not candidate:
                    continue
                parts = candidate.split(BDpoi_list_tagb)
                BD_name, BD_addr = parts[0], ''
                if len(parts) == 2:
                    # City and district are stripped from the candidate
                    # address before it is compared and stored.
                    BD_addr = parts[1].replace(city, '').replace(district, '')
                ratio_res = Le.ratio(name_reduction, BD_name)
                seqratio_res = Le.seqratio(chk_name_list, [BD_name, BD_addr])
                ratio_seqratio_res = (weight_ratio * ratio_res
                                      + weight_seqratio * seqratio_res)
                scored.append((
                    ratio_seqratio_res,
                    source_cells + (BD_name, BD_addr, ratio_seqratio_res,
                                    ratio_res, seqratio_res),
                ))
            # Stable ascending sort: equal scores keep their input order and
            # the best candidate ends up last.
            scored.sort(key=lambda pair: pair[0])

        if scored:
            rows_for_name.extend(row for _, row in scored)
        else:
            # No separator at all, or only empty candidates between
            # separators: keep the source row with blank match cells so it is
            # not dropped and the per-name list is never empty.
            rows_for_name.append(source_cells + ('', '', ''))

    source_columns = ['dbid', 'area_code', 'ref_area_type_code', 'city',
                      'district', 'address', 'city_street', 'name_',
                      'name_reduction', 'BDpoi_list']

    # Workbook 1: every stored row.
    wb = Workbook()
    worksheet = wb.active
    worksheet.append(source_columns + ['BD_name', 'BD_addr',
                                       'ratio_seqratio_res', 'ratio_res',
                                       'seqratio_res'])
    for districts in res_dic.values():
        for names in districts.values():
            for stored_rows in names.values():
                for stored_row in stored_rows:
                    worksheet.append(stored_row)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s' % (source_name, '-Levenshtein', localtime_)
    wb.save('%s%s' % (file_name, '.xlsx'))

    # Workbook 2: only the last stored row of each name.
    wb = Workbook()
    worksheet = wb.active
    worksheet.append(source_columns + ['max_BD_name', 'max_BD_addr',
                                       'max_ratio_seqratio_res', 'ratio_res',
                                       'seqratio_res'])
    for districts in res_dic.values():
        for names in districts.values():
            for stored_rows in names.values():
                worksheet.append(stored_rows[-1])
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    # The second name is built on top of the first output's name, so it
    # carries both suffixes and both timestamps.
    file_name = '%s%s%s' % (file_name, '-Levenshtein-ordered', localtime_)
    wb.save('%s%s' % (file_name, '.xlsx'))


main_()

  

最新文章

  1. 一次U盘拯救的经历
  2. 【USACO 1.5】Prime Palindromes
  3. ACE - Ubuntu下环境搭建
  4. Linux-LVS+keepalived-Testing
  5. sqlserver临时表排序问题
  6. Hacker(18)----了解Windows系统漏洞
  7. AjaxPro使用说明文档
  8. beego: 获取request参数
  9. Intellij IDEA 15 如何同时打开多个项目
  10. PHP源代码加密
  11. Spring知识点回顾(08)spring aware
  12. RPM-GPG-KEY详解
  13. EasyUI + ajax + treegrid/datagrid 接收 json 数据,显示树状/网状表结构
  14. Robot Framework - Tips
  15. Python 入门基础17 --加密、表格、xml模块
  16. 状压dp-----三进制
  17. html5-label标签
  18. Eureka的高可用
  19. java 異常抛出 throw 與 return
  20. django 之 发送邮箱

热门文章

  1. BP神经网络(手写数字识别)
  2. 【Hadoop】三句话告诉你 mapreduce 中MAP进程的数量怎么控制?
  3. elasticsearch 基础性操作
  4. jQuery的DOM操作之加入元素和删除元素
  5. 深入浅出java静态代理和动态代理
  6. 3D版翻页公告效果
  7. Install RabbitMQ server in CentOS 7
  8. NSNotification的几点说明
  9. Ansible@一个高效的配置管理工具--Ansible configure management--翻译(五)
  10. SVN服务端的安装搭建(Linux)