1 Star 0 Fork 0

sheeplu / kenlm

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
clean.py 1.93 KB
一键复制 编辑 原始数据 按行查看 历史
yujishi 提交于 2017-10-23 20:35 . MOD:update
#coding:utf-8
import sys
import os
import jieba
import re
# Module-level lookup of brand words: load_brand_dict() inserts each stripped
# line of ./brand_ext.dic as a key (value is always 1); clean() skips any
# segmented word found here.
brand_dict = {}
def check_contain_chinese(check_str):
    """Return True if any character of check_str lies in U+4E00..U+9FFF.

    Returns False for an empty string or when no character falls inside
    that range.
    """
    return any(u'\u4e00' <= char <= u'\u9fff' for char in check_str)
def load_brand_dict():
    """Fill the module-level brand_dict from ./brand_ext.dic.

    Every line of the file is stripped of surrounding whitespace and stored
    as a key with the value 1. Returns None.
    """
    with open("./brand_ext.dic") as dict_file:
        brand_dict.update((entry.strip(), 1) for entry in dict_file)
def clean():
    """Segment ./goods_name with jieba and write the kept words to ./train_data.

    Loads the jieba user dictionary from 'userdict.txt', then for every line
    of ./goods_name writes exactly one line to ./train_data containing the
    surviving words joined by single spaces (the line may be empty).

    A segmented word is dropped when it:
      * is a single character,
      * consists only of digits, or matches the quantity/number patterns,
      * is one of the blacklisted marketing words,
      * contains no character accepted by check_contain_chinese(),
      * is a key of the module-level brand_dict,
      * contains any of the Chinese numerals one through ten.

    Returns None.
    """
    jieba.load_userdict('userdict.txt')
    # Quantity tokens such as "500ml" or "20g".
    g_pattern = re.compile("[0-9]{1,10}(ml|ML|g|cc|x2|x5|k|G)$")
    # Plain integers / decimals, optionally negative.
    n_pattern = re.compile(r"^(-?\d+)(\.\d*)?$")
    # Sets give O(1) membership tests; contents are unchanged.
    black_words = {'新款', '爆款', '男款', '男士', '女款', '女士', '男式', '女式',
                   '男', '女', '配以', '最最', '个月量', '万用', '不伤', '不怕',
                   '镇店之宝'}
    num_chars = {'一', '二', '三', '四', '五', '六', '七', '八', '九', '十'}

    # Both files are managed by one `with` so the output handle is flushed and
    # closed even if opening the input or segmenting a line raises. The output
    # is still opened (and truncated) first, as before.
    # NOTE(review): no encoding is given, so the platform default is used for
    # both files - confirm the data files match it.
    with open("./train_data", "w") as f_out, open("./goods_name") as f:
        for line in f:
            final_words = []
            for word in jieba.cut(line):
                if len(word) == 1:
                    continue
                if word.isdigit():
                    continue
                if g_pattern.match(word):
                    continue
                if n_pattern.match(word):
                    continue
                if word in black_words:
                    continue
                # Drop words without any Chinese character (e.g. pure English).
                if not check_contain_chinese(word):
                    continue
                # Drop brand words.
                if word in brand_dict:
                    continue
                # Drop words containing a Chinese numeral.
                if any(ch in num_chars for ch in word):
                    continue
                final_words.append(word)
            f_out.write(" ".join(final_words) + "\n")
if __name__ == "__main__":
    # brand_dict is only populated by load_brand_dict(); without this call the
    # brand-word filter in clean() can never match anything. The existence
    # check keeps the script usable when no brand dictionary is present.
    if os.path.isfile("./brand_ext.dic"):
        load_brand_dict()
    clean()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/sheeplu/kenlm.git
git@gitee.com:sheeplu/kenlm.git
sheeplu
kenlm
kenlm
master

搜索帮助

344bd9b3 5694891 D2dac590 5694891