# NOTE: hosting-page residue, not program code: "Code pull complete; the page will refresh automatically."
import jieba
import matplotlib.pyplot as plt
import nltk
from tqdm import tqdm
from config import *
from utils import normalizeString
def analyze_zh():
    """Plot the distribution of Chinese training-sentence lengths.

    Reads the file at ``train_translation_folder/train_translation_zh_filename``
    (one sentence per line), segments every stripped line with ``jieba.cut``
    and shows a 100-bin histogram of the number of segments per sentence.

    The function returns nothing; its only effects are reading the corpus
    file and opening a matplotlib window via ``plt.show()``.
    """
    translation_path = os.path.join(train_translation_folder, train_translation_zh_filename)

    # Decode explicitly instead of relying on the platform default encoding,
    # which mis-decodes (or raises on) Chinese text on non-UTF-8 locales.
    # NOTE(review): assumes the corpus is stored as UTF-8 -- confirm.
    with open(translation_path, 'r', encoding='utf-8') as f:
        sentences = f.readlines()

    # Sentence length = number of segments jieba produces for the line.
    sent_lengths = [
        len(list(jieba.cut(sentence.strip())))
        for sentence in tqdm(sentences)
    ]

    num_bins = 100
    plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    plt.title('Chinese Sentence Lengths Distribution')
    plt.show()
def analyze_en():
    """Plot the distribution of English training-sentence lengths.

    Reads the file at ``train_translation_folder/train_translation_en_filename``
    (one sentence per line), lower-cases and tokenizes every stripped line
    with ``nltk.word_tokenize`` and shows a 100-bin histogram of the number
    of tokens per sentence.

    The function returns nothing; its only effects are reading the corpus
    file and opening a matplotlib window via ``plt.show()``.
    """
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)

    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the corpus is stored as UTF-8 -- confirm.
    with open(translation_path, 'r', encoding='utf-8') as f:
        sentences = f.readlines()

    sent_lengths = []
    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        # One normalized string per NLTK token, so the list length is the
        # English token count of the sentence.
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        # Measure the English tokens; the previous code measured the output
        # of the Chinese segmenter (jieba) and ignored ``tokens`` entirely.
        sent_lengths.append(len(tokens))

    num_bins = 100
    plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    plt.title('English Sentence Lengths Distribution')
    plt.show()
# Script entry point: show the Chinese histogram first, then the English one.
if __name__ == '__main__':
    for run_analysis in (analyze_zh, analyze_en):
        run_analysis()
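# NOTE: hosting-page residue, not program code: "This section may contain content unsuitable for display and is not shown. You can review and modify it using the relevant editing features."
# "If you confirm the content does not involve inappropriate language / pure advertising / violence / vulgar or pornographic material / infringement / piracy / false or worthless content, or content that violates national laws and regulations, you may click submit to appeal, and we will handle it as soon as possible."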