python 读取libsvm文件
2024-09-25 05:38:47
以下三种方式
# -*- coding:utf-8 -*-
import numpy as np
import os
from sklearn import datasets def data_generator(input_filename, batch_size):
"""
:param input_filename:
:param batch_size:
:return:
"""
feature_size = 3
labels = np.zeros(batch_size)
rets = np.empty(shape=[batch_size, feature_size])
i = 0
for line in open(input_filename, "r"):
data = line.split(" ")
label = int(float(data[0]))
ids = []
values = []
for fea in data[1:]:
id, value = fea.split(":")
if int(id) > feature_size - 1:
break
ids.append(int(id))
values.append(float(value))
ret = np.zeros([1, feature_size])
for (index, d) in zip(ids, values):
ret[0][index] = d
labels[i] = int(label)
rets[i] = ret
i += 1
if i > batch_size - 1:
i = 0
yield labels, rets[0:, 0:3] def get_data(input_filename, batch_size):
oneline = 16294 # 一行多少个字节
feature_size = 1947
batch = 0
while True:
data = datasets.load_svmlight_file(input_filename, offset=oneline * batch_size * batch,
length=oneline * batch_size,
n_features=feature_size)
features = data[0]
labels = data[1]
if features.shape[0] > 0: # 保证返回和数据的有效性
batch += 1
yield labels, features[0:, 0:3]
else:
raise StopIteration def get_data_all(input_filename, batch_size):
data = datasets.load_svmlight_file(input_filename)
features = data[0]
labels = data[1]
batch = 0
while True:
start_index = batch * batch_size
end_index = (batch + 1) * batch_size if features.shape[0] > end_index:
yield labels[start_index:end_index], features[start_index:end_index, 0:3]
batch += 1
else:
raise StopIteration if __name__ == "__main__":
print("====", os.getcwd())
filename = "/home/part-00000"
generator = data_generator(filename, 10)
labels, features = generator.next()
print([labels])
print(features) generator = get_data_all(filename, 1000)
while True:
labels, features = generator.next()
print 'data', len(labels), features.shape
对于需要循环多次调用方法的,可以使用缓存,需要注意的是,缓存不能直接加在yiled函数上
# -*- coding:utf-8 -*-
import numpy as np
from sklearn.externals.joblib import Memory
import os
import random
from sklearn import datasets mem = Memory("/tmp/mycache") def get_data_batch(input_filename, batch_size):
data = get_data(input_filename)
features = data[0]
labels = data[1]
batch = 0
while True:
start_index = batch * batch_size
end_index = (batch + 1) * batch_size if features.shape[0] > end_index:
yield labels[start_index:end_index], features[start_index:end_index]
batch += 1
else:
raise StopIteration @mem.cache
def get_data(input_filename):
return datasets.load_svmlight_file(input_filename)
最新文章
- LAMP安装各种问题解决方案
- angularjs中$http、$location、$watch及双向数据绑定学习实现简单登陆验证
- Mac 安装工具包brew
- SqlServer-COMPUTE BY
- Struts流程分析+源码分析
- JavaScript之语句,循环
- asp.net(C#)利用QRCode生成二维码
- JAVA基础知识总结:三
- Linux指令--kill
- c++_day5_成员指针
- 如何让Node.js运行在浏览器端
- (转)GANs and Divergence Minimization
- 并发,同步锁,Runnable,Callable,Future
- 解决xadmin登录卡顿延迟的问题
- apache2.2服务无法启动 发生服务特定错误:1 的解决办法 (windows服务错误 日志查看方法)
- Redis入门到高可用(六)—— 字符串
- php emoji mysql保存和搜索
- 原创:超简单!windows配置NDK开发环境使用JNI
- javascript创建对象之工厂模式(一)
- Linux中目录proc/net/dev详解