# -*- coding:utf-8 -*-

import sys
import re
import numpy as np
from sklearn.externals import joblib
import csv
import matplotlib.pyplot as plt
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import cross_validation
import os
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE #处理域名的最小长度
# Minimum length (in characters) a domain must have to be kept by the loaders.
MIN_LEN = 10

# Seed handed to KMeans so that clustering runs are repeatable.
random_state = 170


def load_alexa(filename):
    """Load domains from a CSV file whose second column holds the domain.

    Rows with fewer than two columns (for example blank lines) are skipped,
    as are domains shorter than ``MIN_LEN`` characters.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of domain strings in file order.
    """
    domain_list = []
    # The context manager closes the file even if parsing fails part-way.
    with open(filename) as csv_file:
        for row in csv.reader(csv_file):
            if len(row) < 2:
                continue
            domain = row[1]
            # Compare the *length* of the domain; comparing the string itself
            # with an int is always true on Python 2 and a TypeError on
            # Python 3.
            if len(domain) >= MIN_LEN:
                domain_list.append(domain)
    return domain_list


def load_dga(filename):
    """Load DGA domains from a comma-separated text feed.

    The first comma-separated field of every line is taken as the domain.
    Example line (feed: http://osint.bambenekconsulting.com/manual/cl.txt)::

        xsxqeadsbgvpdke.co.uk,Domain used by Cryptolocker - Flashback DGA for 13 Apr 2017,2017-04-13,

    Args:
        filename: Path of the feed file to read.

    Returns:
        A list of domain strings, in file order, each at least ``MIN_LEN``
        characters long.
    """
    domain_list = []
    with open(filename) as feed:
        for line in feed:
            # Strip so that a line without a comma does not carry its trailing
            # newline into the domain (and into the length check).
            domain = line.split(",")[0].strip()
            if len(domain) >= MIN_LEN:
                domain_list.append(domain)
    return domain_list


def nb_dga():
    """Cross-validate a Gaussian Naive Bayes classifier on three domain sets.

    Domains from the Alexa list are labelled 0, the two DGA feeds 1 and 2.
    Each domain is vectorised with ``CountVectorizer`` as counts of pairs of
    consecutive single-character tokens, and the 3-fold cross-validation
    scores are printed.
    """
    x1_domain_list = load_alexa("../data/top-1000.csv")
    x2_domain_list = load_dga("../data/dga-cryptolocke-1000.txt")
    x3_domain_list = load_dga("../data/dga-post-tovar-goz-1000.txt")

    x_domain_list = np.concatenate(
        (x1_domain_list, x2_domain_list, x3_domain_list))

    y1 = [0] * len(x1_domain_list)
    y2 = [1] * len(x2_domain_list)
    y3 = [2] * len(x3_domain_list)
    y = np.concatenate((y1, y2, y3))

    print(x_domain_list)

    cv = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                         token_pattern=r"\w", min_df=1)
    x = cv.fit_transform(x_domain_list).toarray()

    clf = GaussianNB()
    print(cross_validation.cross_val_score(clf, x, y, n_jobs=-1, cv=3))


def kmeans_dga():
    """Cluster Alexa and DGA domains with K-Means and plot them via t-SNE.

    The domains are vectorised the same way as in ``nb_dga``, split into two
    clusters with ``KMeans`` (seeded with ``random_state``), projected with
    ``TSNE`` and drawn as a scatter plot: marker ``'o'`` for cluster 1 and
    ``'x'`` for every other point. The clustering is unsupervised, so no
    ground-truth labels are built here.
    """
    x1_domain_list = load_alexa("../data/dga/top-100.csv")
    x2_domain_list = load_dga("../data/dga/dga-cryptolocke-50.txt")
    x3_domain_list = load_dga("../data/dga/dga-post-tovar-goz-50.txt")

    x_domain_list = np.concatenate(
        (x1_domain_list, x2_domain_list, x3_domain_list))

    cv = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                         token_pattern=r"\w", min_df=1)
    x = cv.fit_transform(x_domain_list).toarray()

    model = KMeans(n_clusters=2, random_state=random_state)
    y_pred = model.fit_predict(x)

    tsne = TSNE(learning_rate=100)
    x = tsne.fit_transform(x)
    print(x)
    print(x_domain_list)

    # One scatter call per point, exactly as before, so the rendered figure
    # (including per-call colour assignment) stays the same.
    for point, cluster in zip(x, y_pred):
        px, py = point
        if cluster == 1:
            plt.scatter(px, py, marker='o')
        else:
            plt.scatter(px, py, marker='x')

    plt.show()


if __name__ == '__main__':
    # Call nb_dga() here instead to run the Naive Bayes experiment.
    kmeans_dga()

最新文章

  1. 机器学习——Logistic回归
  2. Oracle常用函数
  3. 2015暑假多校联合---Mahjong tree(树上DP 、深搜)
  4. iar 数据类型 int float
  5. SpringMVC4+thymeleaf3的一个简单实例(篇二:springMVC与thymeleaf的整合)
  6. plsql编程中游标的使用
  7. JAVA byte有无符号数的转换
  8. 一文搞懂各种 Docker 网络 - 每天5分钟玩转 Docker 容器技术(72)
  9. 网易云音乐APP分析
  10. OO_BLOG2_多线程电梯模拟
  11. Docker制作基础镜像
  12. Oracle ORA-01940: 无法删除当前连接的用户
  13. 关于SqlCommand对象的2个方法:ExecuteNonQuery 方法和ExecuteScalar方法
  14. jQuery插件初级练习3答案
  15. TensorFlow学习之 图像预处理
  16. Latest SoC
  17. node获取头信息数据
  18. MATLAB画图设置长宽。并高清复制
  19. 求LCA最近公共祖先的在线ST算法_C++
  20. Nginx 变量漫谈

热门文章

  1. gwt学习资料
  2. php静态函数的使用场景
  3. php实现简单的学生管理系统
  4. UESTC--1271--Search gold(贪心)
  5. hdoj--1248--寒冰王座(完全背包)
  6. Java-MyBatis:MyBatis XML 文件
  7. TCP和UDP的具体区别
  8. C#学习小记
  9. $(function(){});里的方法无效问题
  10. R 连接DB2数据库