以下介绍三种读取 svmlight 格式数据并按批(batch)生成的方式:

# -*- coding:utf-8 -*-
import os

import numpy as np
from sklearn import datasets
def data_generator(input_filename, batch_size, feature_size=3):
    """Yield (labels, features) batches parsed from an svmlight-format text file.

    Each input line looks like ``"<label> <id>:<value> <id>:<value> ..."``.

    :param input_filename: path to the svmlight-format text file
    :param batch_size: number of rows accumulated before each yield
    :param feature_size: number of feature columns kept per row
        (was hard-coded to 3 in the original)
    :return: generator of ``(labels, features)`` where ``labels`` has shape
        ``(batch_size,)`` and ``features`` has shape ``(batch_size, feature_size)``
    """
    labels = np.zeros(batch_size)
    batch = np.empty(shape=[batch_size, feature_size])
    row = 0
    # 'with' guarantees the file handle is closed (the original leaked it).
    with open(input_filename, "r") as fp:
        for line in fp:
            fields = line.split(" ")
            label = int(float(fields[0]))
            dense = np.zeros(feature_size)
            for fea in fields[1:]:
                # 'fid' instead of 'id' — avoid shadowing the builtin.
                fid, value = fea.split(":")
                # Feature ids are assumed ascending, so the first
                # out-of-range id ends the line (matches original 'break').
                if int(fid) > feature_size - 1:
                    break
                dense[int(fid)] = float(value)
            labels[row] = label
            batch[row] = dense
            row += 1
            if row >= batch_size:
                row = 0
                # NOTE: the same arrays are reused and overwritten between
                # yields, exactly as in the original implementation.
                yield labels, batch[:, :feature_size]
def get_data(input_filename, batch_size):
    """Stream svmlight batches by byte offset, loading one batch per call.

    Relies on every line of the file being exactly ``oneline`` bytes wide —
    TODO confirm this fixed-width assumption holds for the input data.

    :param input_filename: path to the svmlight-format file
    :param batch_size: number of lines loaded per batch
    :return: generator of ``(labels, features[:, 0:3])``
    """
    oneline = 16294  # bytes per line (fixed-width records)
    feature_size = 1947
    batch = 0
    while True:
        data = datasets.load_svmlight_file(input_filename,
                                           offset=oneline * batch_size * batch,
                                           length=oneline * batch_size,
                                           n_features=feature_size)
        features = data[0]
        labels = data[1]
        if features.shape[0] > 0:  # guard: stop once the file is exhausted
            batch += 1
            yield labels, features[0:, 0:3]
        else:
            # PEP 479: raising StopIteration inside a generator becomes a
            # RuntimeError on Python 3.7+; 'return' ends the generator cleanly.
            return
def get_data_all(input_filename, batch_size):
    """Load the whole svmlight file once, then yield fixed-size slices of it.

    :param input_filename: path to the svmlight-format file
    :param batch_size: rows per yielded batch
    :return: generator of ``(labels[start:end], features[start:end, 0:3])``
    """
    data = datasets.load_svmlight_file(input_filename)
    features = data[0]
    labels = data[1]
    batch = 0
    while True:
        start_index = batch * batch_size
        end_index = (batch + 1) * batch_size
        # NOTE: a trailing partial batch (< batch_size rows) is dropped,
        # matching the original's strict '>' comparison.
        if features.shape[0] > end_index:
            yield labels[start_index:end_index], features[start_index:end_index, 0:3]
            batch += 1
        else:
            # PEP 479: 'return', not 'raise StopIteration', inside a generator.
            return
if __name__ == "__main__":
    # Smoke-test the generators against a local sample file.
    print("====", os.getcwd())
    filename = "/home/part-00000"
    generator = data_generator(filename, 10)
    # Python 3: use the next() builtin — the .next() method was removed.
    labels, features = next(generator)
    print([labels])
    print(features)
    generator = get_data_all(filename, 1000)
    while True:
        try:
            labels, features = next(generator)
        except StopIteration:
            break  # generator exhausted — stop instead of crashing
        # Python 3: print is a function, not a statement.
        print('data', len(labels), features.shape)

对于需要循环多次调用的方法,可以使用缓存。需要注意的是,缓存装饰器不能直接加在含有 yield 的生成器函数上,应缓存其内部的数据加载函数:

# -*- coding:utf-8 -*-
import numpy as np
from sklearn.externals.joblib import Memory
import os
import random
from sklearn import datasets mem = Memory("/tmp/mycache") def get_data_batch(input_filename, batch_size):
data = get_data(input_filename)
features = data[0]
labels = data[1]
batch = 0
while True:
start_index = batch * batch_size
end_index = (batch + 1) * batch_size if features.shape[0] > end_index:
yield labels[start_index:end_index], features[start_index:end_index]
batch += 1
else:
raise StopIteration @mem.cache
def get_data(input_filename):
    """Load the entire svmlight file; joblib's Memory decorator (applied at
    the definition site) memoises the result on disk across calls."""
    return datasets.load_svmlight_file(input_filename)

最新文章

  1. LAMP安装各种问题解决方案
  2. angularjs中$http、$location、$watch及双向数据绑定学习实现简单登陆验证
  3. Mac 安装工具包brew
  4. SqlServer-COMPUTE BY
  5. Struts流程分析+源码分析
  6. JavaScript之语句,循环
  7. asp.net(C#)利用QRCode生成二维码
  8. JAVA基础知识总结:三
  9. Linux指令--kill
  10. c++_day5_成员指针
  11. 如何让Node.js运行在浏览器端
  12. (转)GANs and Divergence Minimization
  13. 并发,同步锁,Runnable,Callable,Future
  14. 解决xadmin登录卡顿延迟的问题
  15. apache2.2服务无法启动 发生服务特定错误:1 的解决办法 (windows服务错误 日志查看方法)
  16. Redis入门到高可用(六)—— 字符串
  17. php emoji mysql保存和搜索
  18. 原创:超简单!windows配置NDK开发环境使用JNI
  19. javascript创建对象之工厂模式(一)
  20. Linux中目录proc/net/dev详解

热门文章

  1. iOS-tableView本地动画刷新
  2. sql server数据库,禁用启用触发器各种情况!
  3. spring boot打包会有.war.original文件的原因 (笔记)
  4. ASP.NET MVC 数据库依赖缓存
  5. cocos2d-X学习之主要类介绍:CCDirector
  6. Less-mixin函数基础一
  7. iOS接收远程通知响应方法
  8. Json对象与Json字符串的转化
  9. Java加密技术(四)非对称加密算法RSA
  10. (4.3)ODBC/OLE DB/ADO概念与使用情况