6 Star 5 Fork 3

Gitee 极速下载 / Prose-Go

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
此仓库是为了提升国内下载速度的镜像仓库,每日同步一次。 原始仓库: https://github.com/jdkato/prose
克隆/下载
document.go 3.35 KB
一键复制 编辑 原始数据 按行查看 历史
Eric Gaudet 提交于 2020-08-19 16:49 . Introduce Tokenizer interface (#61)
package prose
// A DocOpt represents a setting that changes the document creation process.
//
// Options are applied in the order they are passed to NewDocument, so a
// later option may override an earlier one.
//
// For example, it might disable named-entity extraction:
//
//	doc := prose.NewDocument("...", prose.WithExtraction(false))
type DocOpt func(doc *Document, opts *DocOpts)
// DocOpts controls the Document creation process:
type DocOpts struct {
	Extract   bool      // If true, include named-entity extraction
	Segment   bool      // If true, include segmentation
	Tag       bool      // If true, include POS tagging
	Tokenizer Tokenizer // The Tokenizer to use; a nil value disables tokenization
}
// UsingTokenizer specifies the Tokenizer to use.
//
// Note that tagging and entity extraction both operate on the tokens this
// Tokenizer produces.
func UsingTokenizer(include Tokenizer) DocOpt {
	return func(_ *Document, opts *DocOpts) {
		opts.Tokenizer = include
	}
}
// WithTokenization can enable (the default) or disable tokenization.
//
// Deprecated: use UsingTokenizer instead.
func WithTokenization(include bool) DocOpt {
	return func(_ *Document, opts *DocOpts) {
		if include {
			return
		}
		// Disabling tokenization means clearing the Tokenizer entirely.
		opts.Tokenizer = nil
	}
}
// WithTagging can enable (the default) or disable POS tagging.
func WithTagging(include bool) DocOpt {
	return func(_ *Document, opts *DocOpts) {
		opts.Tag = include
	}
}
// WithSegmentation can enable (the default) or disable sentence segmentation.
func WithSegmentation(include bool) DocOpt {
	return func(_ *Document, opts *DocOpts) {
		opts.Segment = include
	}
}
// WithExtraction can enable (the default) or disable named-entity extraction.
func WithExtraction(include bool) DocOpt {
	return func(_ *Document, opts *DocOpts) {
		opts.Extract = include
	}
}
// UsingModel specifies the Model to use for tagging and extraction,
// overriding the default model that NewDocument would otherwise load.
func UsingModel(model *Model) DocOpt {
	return func(doc *Document, opts *DocOpts) {
		doc.Model = model
	}
}
// A Document represents a parsed body of text.
type Document struct {
	Model *Model // The model backing tagging and entity extraction.
	Text  string // The raw input text.

	// TODO: Store offsets (begin, end) instead of `text` field.
	entities  []Entity   // Populated when extraction is enabled.
	sentences []Sentence // Populated when segmentation is enabled.
	tokens    []*Token   // Populated when a Tokenizer is set.
}
// Tokens returns `doc`'s tokens.
//
// The result is a fresh slice of Token values, so mutating it does not
// affect the document's internal state.
func (doc *Document) Tokens() []Token {
	out := make([]Token, len(doc.tokens))
	for i, tok := range doc.tokens {
		out[i] = *tok
	}
	return out
}
// Sentences returns `doc`'s sentences.
//
// The slice is nil unless segmentation was enabled during creation.
func (doc *Document) Sentences() []Sentence {
	return doc.sentences
}
// Entities returns `doc`'s entities.
//
// The slice is nil unless named-entity extraction was enabled during creation.
func (doc *Document) Entities() []Entity {
	return doc.entities
}
// defaultOpts is the pipeline configuration used when NewDocument receives
// no options: every stage enabled, tokenizing with the iterator tokenizer.
var defaultOpts = DocOpts{
	Extract:   true,
	Segment:   true,
	Tag:       true,
	Tokenizer: NewIterTokenizer(),
}
// NewDocument creates a Document according to the user-specified options.
//
// For example,
//
//	doc := prose.NewDocument("...")
//
// Options are applied in order on top of defaultOpts, so later options
// override earlier ones.
func NewDocument(text string, opts ...DocOpt) (*Document, error) {
	doc := Document{Text: text}

	base := defaultOpts
	for _, applyOpt := range opts {
		applyOpt(&doc, &base)
	}

	// Unless a model was supplied via UsingModel, load the default model
	// sized for the requested pipeline stages.
	if doc.Model == nil {
		doc.Model = defaultModel(base.Tag, base.Extract)
	}

	if base.Segment {
		segmenter := newPunktSentenceTokenizer()
		doc.sentences = segmenter.segment(text)
	}
	if base.Tokenizer != nil {
		doc.tokens = append(doc.tokens, base.Tokenizer.Tokenize(text)...)
	}
	// Extraction consumes POS tags, so tagging runs for either stage.
	if base.Tag || base.Extract {
		doc.tokens = doc.Model.tagger.tag(doc.tokens)
	}
	if base.Extract {
		doc.tokens = doc.Model.extracter.classify(doc.tokens)
		doc.entities = doc.Model.extracter.chunk(doc.tokens)
	}

	// No pipeline stage currently reports an error; the error return is
	// kept for API stability. (The original declared a pipeError variable
	// that was never assigned — always nil.)
	return &doc, nil
}
Go
1
https://gitee.com/mirrors/Prose-Go.git
git@gitee.com:mirrors/Prose-Go.git
mirrors
Prose-Go
Prose-Go
master

搜索帮助