1 Star 1 Fork 0

左令君 / Machine-Translation

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
analyze_data.py 1.44 KB
一键复制 编辑 原始数据 按行查看 历史
foamliu 提交于 2018-10-11 10:54 . update
import jieba
import matplotlib.pyplot as plt
import nltk
from tqdm import tqdm
from config import *
from utils import normalizeString
def analyze_zh():
    """Plot a histogram of Chinese training-sentence lengths.

    Reads the file named by ``train_translation_zh_filename`` inside
    ``train_translation_folder`` (both provided by ``config``), segments
    every stripped line with ``jieba.cut`` and records the number of
    segments per line. The lengths are shown as a 100-bin histogram.
    """
    translation_path = os.path.join(train_translation_folder, train_translation_zh_filename)
    # Decode explicitly: relying on the locale default breaks Chinese text on
    # platforms whose default is not UTF-8.
    # NOTE(review): assumes the corpus is stored as UTF-8 - confirm.
    with open(translation_path, 'r', encoding='utf-8') as f:
        sentences = f.readlines()

    sent_lengths = []
    for sentence in tqdm(sentences):
        seg_list = list(jieba.cut(sentence.strip()))
        # Sentence length is measured in jieba segments.
        sent_lengths.append(len(seg_list))

    num_bins = 100
    plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'Chinese Sentence Lengths Distribution'
    plt.title(title)
    plt.show()
def analyze_en():
    """Plot a histogram of English training-sentence lengths.

    Reads the file named by ``train_translation_en_filename`` inside
    ``train_translation_folder`` (both provided by ``config``). Each line is
    stripped, lower-cased and tokenised with ``nltk.word_tokenize``; every
    token is passed through ``normalizeString``. The number of tokens per
    line is recorded and shown as a 100-bin histogram.
    """
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)
    # Decode explicitly instead of relying on the platform's locale default.
    # NOTE(review): assumes the corpus is stored as UTF-8 - confirm.
    with open(translation_path, 'r', encoding='utf-8') as f:
        sentences = f.readlines()

    sent_lengths = []
    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        # Count the NLTK tokens. The previous code discarded them and counted
        # jieba segments of the raw line, which is a Chinese segmenter and
        # does not reflect English word counts.
        sent_lengths.append(len(tokens))

    num_bins = 100
    plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'English Sentence Lengths Distribution'
    plt.title(title)
    plt.show()
# Script entry point: run the Chinese analysis first, then the English one.
if __name__ == '__main__':
    for run_analysis in (analyze_zh, analyze_en):
        run_analysis()
1
https://gitee.com/ling_jun_zuo/Machine-Translation.git
git@gitee.com:ling_jun_zuo/Machine-Translation.git
ling_jun_zuo
Machine-Translation
Machine-Translation
master

搜索帮助