#!/usr/bin/python
# -*- coding: UTF-8 -*- import xml.sax
import io, sys paper_tags = ('article', 'inproceedings', 'proceedings', 'book', 'incollection', 'phdthesis', 'mastersthesis', 'www') sub_tags = ('publisher', 'journal', 'booktitle') ret = [] class DBLPHandler(xml.sax.ContentHandler): def __init__(self):
self.id = 1
self.reset() def reset(self):
self.dup_article = 0
self.curtag = None
self.author = ''
self.title = ''
self.pages = ''
self.year = ''
self.volume = ''
self.journal = ''
self.number = ''
self.url = ''
self.ee = '' def write_to_file(self, filename):
file_object = file(filename, 'a+')
for line in ret:
file_object.write(line.encode('utf8'))
#file_object.write('\n')
file_object.close() def record_row(self):
ret.append(u''.join((self.author, self.title, self.year, self.pages, self.journal, self.ee, '\n')).replace(' ', ''))
#ret.append(self.author + self.title + self.year + self.pages+ self.journal + self.ee)
#ret.append((self.author, self.title, self.year, self.pages, self.journal, self.ee))
#print (self.author, self.title, self.year, self.pages) def startElement(self, tag, attributes):
if tag != None and len(tag.strip()) > 0:
if tag == 'article':
self.dup_article += 1
self.curtag = tag def endElement(self, tag):
if tag != None and len(tag.strip()) > 0:
if tag == 'article':
self.record_row()
self.reset() def characters(self, content):
if content != '\n':
if self.curtag == "title":
self.title = content.strip()
elif self.curtag == "author":
self.author = content.strip()
elif self.curtag == "year":
self.year = content.strip()
elif self.curtag == "ee":
self.ee = content.strip()
elif self.curtag == "journal":
self.journal = content.strip()
elif self.curtag == "pages":
self.pages = content.strip()
elif self.url == "url":
self.url = content.strip()
elif self.number == "number":
self.number = content.strip()
elif self.number == "volume":
self.volume = content.strip() if (__name__ == "__main__"):
filename = 'dblp.xml'
if len(sys.argv) == 2:
filename = sys.argv[1]
# 创建一个 XMLReader
parser = xml.sax.make_parser()
# turn off namepsaces
parser.setFeature(xml.sax.handler.feature_namespaces, 0) # 重写 ContextHandler
Handler = DBLPHandler()
parser.setContentHandler(Handler) parser.parse(filename)
print 'Parser Complete!'
Handler.write_to_file('out')

另外附处理DNA数据的脚本程序:

lens_DNA = [0, 1000, 2000, 2500, 500, 1000, 1500, 2000, 2500]
lens_DBLP = [0, 40, 120, 200, 40, 80, 120, 160, 200] file_id = 1
LINE_MAX = 100 class DNA_Handler:
def __init__(self):
self.strn = '' def write_to_file(self, filename):
file_object = open(filename, 'a+')
file_object.write(self.strn)
file_object.close() def read_file(self, filename):
fo = open(filename, 'r')
line = fo.readline()
self.strn = ''
file_id = 1
cnt_lines = 0
while line and file_id < 9:
line = line.replace('\n', '')
self.strn += line
if len(self.strn) > lens_DNA[file_id]:
self.strn = self.strn[0: lens_DNA[file_id]] + '\n'
print self.strn
if file_id <= 3:
self.write_to_file('DNA_N' + str(file_id))
else:
self.write_to_file('DNA_M' + str(file_id - 3))
self.strn = ''
cnt_lines += 1
if cnt_lines >= LINE_MAX:
file_id += 1
cnt_lines = 0
line = fo.readline()
fo.close()
print 'read_finished!' class DBLP_Handler: def __init__(self):
self.strn = '' def write_to_file(self, filename):
file_object = open(filename, 'a+')
file_object.write(self.strn)
file_object.close() def read_file(self, filename):
fo = open(filename, 'r')
line = fo.readline()
self.strn = ''
file_id = 1
cnt_lines = 0
while line and file_id < 9:
line = line.replace('\n', '')
self.strn += line
if len(self.strn) > lens_DBLP[file_id]:
self.strn = self.strn[0: lens_DBLP[file_id]] + '\n'
print self.strn
self.write_to_file('DBLP_' + str(file_id))
self.strn = ''
cnt_lines += 1
if cnt_lines >= LINE_MAX:
file_id += 1
cnt_lines = 0
line = fo.readline()
fo.close()
print 'read_finished!' if (__name__ == '__main__'): dh = DNA_Handler()
dh.read_file('human_dna.fa')
'''
bblp_h = DBLP_Handler()
bblp_h.read_file('DBLP_data')
'''

   

最新文章

  1. CSS过滤器
  2. Java对象序列化---转载
  3. ES6 - promise对象
  4. Javascript开发之工具归纳
  5. windows添加和删除服务
  6. 2016年11月30日 星期三 --出埃及记 Exodus 20:21
  7. Java时间戳与日期格式字符串的互转
  8. js获取对象、数组的实际长度,元素实际个数
  9. oninput,onpropertychange,onchange的使用方法和差别
  10. nginx+apache 404错误页面
  11. About Spring
  12. 区间RMQ问题
  13. return返回两个三元表达式的和,返回不正确,同样要注意在JavaScript中,也是如此
  14. 收藏:SQL Server 数据库改名
  15. angular五种服务详解
  16. hdu 5035 指数分布无后效性
  17. 登录状态保持Session/Cookie
  18. shentou mianshiti
  19. SpringMVC(二)-- springmvc的系统学习之跳转结果的方式
  20. iOS 程序启动流程

热门文章

  1. winsock教程- windows下的socket编程(c语言实现)
  2. 黑马程序员——JAVA基础之装饰设计模式
  3. phpunit4.1的干净测试
  4. 7、java实现的两种单例模式
  5. SSIS 参数与环境
  6. HackerRank &quot;Minimum Penalty Path&quot;
  7. Mysql 自定义HASH索引带来的巨大性能提升----[挖坑篇]
  8. php_sapi_name详解
  9. (Design Pattern) Singleton.
  10. Java 开发必会的 Linux 命令