运行环境需求

 # All Import Statements Defined Here
# Note: Do not add to this list.
# All the dependencies you need, can be installed by running .
# ----------------
import sys

# Refuse to run on anything other than Python 3 with minor version >= 5.
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import pprint
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]
import nltk
nltk.download('reuters')
from nltk.corpus import reuters
import numpy as np
import random
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

START_TOKEN = '<START>'
END_TOKEN = '<END>'

# Seed both NumPy's and the standard library's random generators with 0.
np.random.seed(0)
random.seed(0)
# ----------------

Question 1.1: Implement distinct_words [code] (2 points)

Write a method to work out the distinct words (word types) that occur in the corpus. You can do this with for loops, but it's more efficient to do it with Python list comprehensions. In particular, this may be useful to flatten a list of lists. If you're not familiar with Python list comprehensions in general, here's more information.

You may find it useful to use Python sets to remove duplicate words.

 def distinct_words(corpus):
     """Determine a list of distinct words for the corpus.

     Params:
         corpus (list of list of strings): corpus of documents
     Return:
         corpus_words (list of strings): list of distinct words across the
             corpus, sorted (using python 'sorted' function)
         num_corpus_words (integer): number of distinct words across the corpus
     """
     # ------------------
     # Write your implementation here.
     # The set comprehension flattens the list of documents and removes
     # duplicate words in a single pass; sorted() then returns a new list.
     corpus_words = sorted({word for document in corpus for word in document})
     num_corpus_words = len(corpus_words)
     # ------------------

     return corpus_words, num_corpus_words

Question 1.2: Implement compute_co_occurrence_matrix [code] (3 points)

Write a method that constructs a co-occurrence matrix for a given window size $n$, counting the words that appear within $n$ positions before and after each center word.

最新文章

  1. Jenkins持续集成 & .NET
  2. sql语句,怎么取查询结果的位置
  3. ctrip
  4. JSON字符串和java对象的互转【json-lib】
  5. 源码编译安装screen
  6. 屠龙之路_狭路相逢勇者胜_EighthDay
  7. vSphere Client无法连接到服务器 出现未知错误的解决方法
  8. sp_change_users_login解决孤立用户问题
  9. MongoDB学习笔记——文档操作之查询
  10. Entity Framework 第七篇 简化排序
  11. flex中实现自动换行
  12. hdu 4618 Palindrome Sub-Array
  13. JAVA元运算符,一元运算符,二元运算符,三元运算符
  14. 提示29. 怎样避免延迟加载或Load()阅读器问题
  15. PHP完整环境搭建
  16. rust尝鲜
  17. 快速构建Windows 8风格应用13-SearchContract构建
  18. css盒模型研究
  19. 故障定位之查找附近点GeoHash研讨
  20. C# Split 根据组合字符进行拆分数组用法

热门文章

  1. Python的定时执行
  2. oracle多表关联update
  3. Xshell的一些使用方法和注意事项
  4. MySQL入门——在Linux下安装和卸载MariaDB
  5. Mac下安装adb
  6. Unity API学习笔记(1)
  7. [日常] git版本回退
  8. 在linux中怎么查看错误日志
  9. PHP转Go系列:字符串
  10. 5-3 可视化库Seaborn-变量分析绘图