代码拉取完成,页面将自动刷新
同步操作将从 OpenHarmony-SIG/jsoup 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
快速且宽容的HTML解析器
按功能对应下载安装:
场景一:HTML操作:对HTML文档进行解析、提取、清理
npm install sanitize-html --save
npm install @types/sanitize-html --save-dev
场景二:HTML转化为整洁的XHTML
npm install @ohos/htmltoxml --save
OpenHarmony npm环境配置等更多内容,请参考 如何安装OpenHarmony npm包 。
import { Parser } from 'htmlparser2'
// Streaming usage: the parser reports each piece of markup through the
// callbacks below while `write` consumes the input.
const parser = new Parser({
  onopentag(name, attributes) {
    // Serialize the attribute map explicitly; interpolating the object
    // directly would only print "[object Object]".
    console.info(`jsoup onopentag name --> ${name} attributes --> ${JSON.stringify(attributes)}`)
  },
  ontext(text) {
    console.info("jsoup text -->", text);
  },
  onopentagname(name) {
    console.info("jsoup tagName -->", name);
  },
  onattribute(name, value) {
    console.info(`jsoup attribName name --> ${name} value --> ${value}`)
  },
  onclosetag(tagname) {
    console.info("jsoup closeTag --> ", tagname);
  },
});
// Feed the markup, then signal the end of the input.
parser.write(html);
parser.end();
import { Parser } from 'htmlparser2'
import { DomHandler } from 'domhandler'
import * as DomUtils from 'domutils'
// DOM usage: DomHandler collects the parsed nodes and hands them to this
// callback, where DomUtils is used to query them.
const handler = new DomHandler((error, dom) => {
  if (error) {
    // Report the failure instead of silently ignoring it.
    console.error('jsoup parse error:' + error.message);
    return;
  }
  // Parsing completed, do something
  console.info('jsoup dom.toString()=' + dom);
  const elements = DomUtils.getElementsByTagName('style', dom)
  console.info('jsoup elements.length=', elements.length);
  const element = elements[0]
  // The document may contain no <style> element; Object.keys(undefined) would throw.
  if (element) {
    console.info('jsoup element=', Object.keys(element));
  }
  const text = DomUtils.getText(elements)
  console.info('jsoup text=', text);
});
const parser = new Parser(handler, { decodeEntities: true });
parser.write(html);
parser.end();
import { parseDocument } from 'htmlparser2'
import * as DomUtils from 'domutils'
// Parse the complete markup in one call. The result type is inferred; an
// explicit `Document` annotation would resolve to the global DOM type unless
// the matching type is imported.
const dom = parseDocument(html)
// Work on the parsed DOM object through DomUtils.
// Get elements by tag name
const element = DomUtils.getElementsByTagName('style', dom)
// Get the text content
const text = DomUtils.getText(element)
// Check whether the node is a tag
const isTag = DomUtils.isTag(element[0])
// Check whether the node is CDATA
const isCDATA = DomUtils.isCDATA(element[0])
// Check whether the node is Text
const isText = DomUtils.isText(element[0])
// Check whether the node is a Comment
const isComment = DomUtils.isComment(element[0])
// Get the children of a given element. The original snippet read `body[0]`
// without ever defining `body`, so look the <body> element up first.
const body = DomUtils.getElementsByTagName('body', dom)
const childrens = body.length > 0 ? DomUtils.getChildren(body[0]) : []
import http from '@ohos.net.http';
// Fetch the HTML over HTTP and feed the response body to the parser.
const httpRequest = http.createHttp()
httpRequest.request('http://106.15.92.248/share/html.txt')
  .then((data) => {
    console.log("jsoup url html=" + JSON.stringify(data))
    // TODO do something
    // Only a string result can be written to the parser.
    if (data.result && typeof data.result === 'string') {
      parser.write(data.result);
      parser.end();
    }
    // Release the request object once the response has been consumed.
    httpRequest.destroy()
  })
  .catch((err) => {
    console.error('jsoup connect error:' + JSON.stringify(err));
    // Release the request object on failure as well.
    httpRequest.destroy()
  })
import fileio from '@ohos.fileio';
// Read the stream into a byte buffer sized by html.length, starting at
// position 0, then turn the bytes into a string for the parser.
// NOTE(review): `stream` and `html` are defined outside this snippet.
const byteCount = html.length
const bytes = new ArrayBuffer(byteCount)
stream.readSync(bytes, { offset: 0, length: byteCount, position: 0 })
// Each byte is mapped to one character code.
const htmlText = String.fromCharCode.apply(null, new Uint8Array(bytes))
// TODO do something
parser.write(htmlText);
parser.end();
import util from '@ohos.util';
// NOTE: MainAbility must assign this variable first: globalThis.Context = this.context;
if (!globalThis.Context) {
  console.log('jsoup global Context is undefined');
  return;
}
// Load the raw file, decode its bytes as UTF-8 text and feed it to the parser.
globalThis.Context.resourceManager.getRawFile(filePath)
  .then((rawBytes) => {
    const decoder = new util.TextDecoder("utf-8", { ignoreBOM: true })
    const result: string = decoder.decode(rawBytes, { stream: false })
    // TODO do something
    parser.write(result);
    parser.end();
  })
  .catch((err) => {
    console.log("jsoup getHtmlFromRawFile err=" + err)
  })
import fileio from '@ohos.fileio';
// The application context supplies the files directory; stop when it is missing.
if (!globalThis.Context) {
  console.log('jsoup global Context is undefined');
  return;
}
// Read jsoup.html from the files directory as text and feed it to the parser.
const filePath = globalThis.Context.filesDir + '/jsoup.html';
fileio.readText(filePath)
  .then((content) => {
    console.log("jsoup getHtmlFromFilePath text=" + content);
    // TODO do something
    parser.write(content);
    parser.end();
  })
  .catch((err) => {
    console.log("jsoup getHtmlFromFilePath err=" + err)
  })
import SanitizeHtml from 'sanitize-html'
使用默认的标签和属性列表:
const clean = SanitizeHtml(dirty);
允许的特定的标签和属性不会被清除:
// Only the listed tags and attributes are kept. The function is called by
// the name it is imported under (`SanitizeHtml`); the lowercase
// `sanitizeHtml` used before is not defined by that import.
// NOTE(review): 'iframe' is not in allowedTags here, so allowedIframeHostnames
// presumably has no effect in this example — confirm.
const clean = SanitizeHtml(dirty, {
  allowedTags: ['b', 'i', 'em', 'strong', 'a'],
  allowedAttributes: {
    'a': ['href']
  },
  allowedIframeHostnames: ['www.youtube.com']
});
在默认列表的基础上添加标签:
// Extend the default tag allow-list with <img>.
const clean = SanitizeHtml(dirty, {
  allowedTags: [...SanitizeHtml.defaults.allowedTags, 'img']
});
将不允许的标签进行转义,而不是清除:
// With disallowedTagsMode set to 'escape', tags that are not allowed are
// escaped instead of being removed; allowedTags: [] allows no tag at all.
const clean = SanitizeHtml('before <img src="test.png" /> after', {
disallowedTagsMode: 'escape',
allowedTags: [],
allowedAttributes: false
})
允许所有标签或所有属性:
allowedTags: false,
allowedAttributes: false
不想允许任何标签:
allowedTags: [],
allowedAttributes: {}
在特定元素上允许特定的CSS类:
// Allow the CSS classes 'fancy' and 'simple' on <p> elements.
const clean = SanitizeHtml(dirty, {
allowedTags: [ 'p', 'em', 'strong' ],
allowedClasses: {
'p': [ 'fancy', 'simple' ]
}
});
在特定元素上允许特定的CSS样式:
// Allow the style attribute on <p> and list, per CSS property, the patterns
// a value has to match.
const clean = SanitizeHtml(dirty, {
allowedTags: ['p'],
allowedAttributes: {
'p': ["style"],
},
allowedStyles: {
// Rules under '*' are keyed by a wildcard rather than a tag name
'*': {
// Match HEX and RGB
'color': [/^#(0x)?[0-9a-f]+$/i, /^rgb\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*\)$/],
'text-align': [/^left$/, /^right$/, /^center$/],
// Match any number with px, em, or %
'font-size': [/^\d+(?:px|em|%)$/]
},
// Rules keyed by 'p': a font-size given in rem
'p': {
'font-size': [/^\d+rem$/]
}
}
});
// Change a tag: transformTags maps the tag name 'ol' to 'ul'.
const dirty='<ol><li>Hello world</li></ol>';
const clean = SanitizeHtml(dirty, {
transformTags: {
'ol': 'ul',
}
});
更改标签并且添加属性:
// Change the tag and add attributes: simpleTransform turns 'ol' into 'ul'
// with class 'foo'; allowedAttributes permits foo, bar and class on <ul>.
const dirty = '<ol foo="foo" bar="bar" baz="baz"><li>Hello world</li></ol>';
const clean = SanitizeHtml(dirty, {
transformTags: { ol: SanitizeHtml.simpleTransform('ul', { class: 'foo' }) },
allowedAttributes: { ul: ['foo', 'bar', 'class'] }
});
// Custom transformer for <a>: keeps the tag and its attributes and supplies
// the text 'Some text'.
const clean = SanitizeHtml(dirty, {
  transformTags: {
    'a': (tagName, attribs) => ({
      tagName: 'a',
      attribs,
      text: 'Some text'
    })
  }
});
例如,您可以转换缺少锚文本的链接元素:
<a href="http://somelink.com"></a>
到带有锚文本的链接:
<a href="http://somelink.com">Some text</a>
// exclusiveFilter returns true for <a> frames whose text is empty after trimming.
const dirty = '<p>This is <a href="http://www.linux.org"></a><br/>Linux</p>';
const clean = SanitizeHtml(dirty, {
  exclusiveFilter: (frame) => frame.tag === 'a' && !frame.text.trim()
});
import { XMLWriter } from '@ohos/htmltoxml'
// DOCTYPE identifiers passed to the writer. The system identifier is the
// XHTML 1.1 DTD URL; the previous value contained a typo ("TR?xhtml11").
const property = [
  { key: XMLWriter.DOCTYPE_PUBLIC, value: '-//W3C//DTD XHTML 1.1//EN' },
  { key: XMLWriter.DOCTYPE_SYSTEM, value: 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd' }
]
const xml = new XMLWriter(html, property);
xml.convertToXML((content, error) => {
  if (error) {
    // Surface conversion failures instead of ignoring them.
    console.error('jsoup convertToXML error:' + error.message);
    return;
  }
  // TODO do something with the converted XHTML
  console.info('jsoup convertToXML content=' + content);
})
类型定义:
// Parser handler callbacks
interface Handler {
onparserinit(parser: Parser): void;
onreset(): void;
onend(): void;
onerror(error: Error): void;
onclosetag(name: string): void;
onopentagname(name: string): void;
// quote may be omitted, undefined or null
onattribute(name: string, value: string, quote?: string | undefined | null): void;
// attribs maps attribute names to their string values
onopentag(name: string, attribs: {
[s: string]: string;
}): void;
ontext(data: string): void;
oncomment(data: string): void;
oncdatastart(): void;
oncdataend(): void;
oncommentend(): void;
onprocessinginstruction(name: string, data: string): void;
}
// Parser options; every flag is optional.
// NOTE(review): the effect of each flag is not shown here — presumably as
// the names suggest; confirm against the htmlparser2 documentation.
interface ParserOptions {
decodeEntities?: boolean;
lowerCaseTags?: boolean;
lowerCaseAttributeNames?: boolean;
recognizeCDATA?: boolean;
}
// Sanitizes HTML to defend against XSS attacks
declare namespace sanitize {
// Attribute name -> attribute value
interface Attributes { [attr: string]: string; }
// A tag as returned by a Transformer; text is optional
interface Tag { tagName: string; attribs: Attributes; text?: string ; }
// Receives a tag name and its attributes and returns the resulting Tag
type Transformer = (tagName: string, attribs: Attributes) => Tag;
// Either a plain attribute name or an object listing the values accepted for it
type AllowedAttribute = string | { name: string; multiple?: boolean ; values: string[] };
type DisallowedTagsModes = 'discard' | 'escape' | 'recursiveEscape';
// Shape of the `defaults` constant declared below
interface IDefaults {
allowedAttributes: Record<string, AllowedAttribute[]>;
allowedSchemes: string[];
allowedSchemesByTag: { [index: string]: string[] };
allowedSchemesAppliedToAttributes: string[];
allowedTags: string[];
allowProtocolRelative: boolean;
disallowedTagsMode: DisallowedTagsModes;
enforceHtmlBoundary: boolean;
selfClosing: string[];
}
// Argument passed to IOptions.exclusiveFilter
interface IFrame {
tag: string;
attribs: { [index: string]: string };
text: string;
tagPosition: number;
}
// Options accepted by the sanitizer; every member is optional
interface IOptions {
// false is accepted in place of a map
allowedAttributes?: Record<string, AllowedAttribute[]> | false;
allowedStyles?: { [index: string]: { [index: string]: RegExp[] } } ;
allowedClasses?: { [index: string]: boolean | Array<string | RegExp> };
allowIframeRelativeUrls?: boolean ;
allowedSchemes?: string[] | boolean ;
allowedSchemesByTag?: { [index: string]: string[] } | boolean ;
allowedSchemesAppliedToAttributes?: string[] ;
allowProtocolRelative?: boolean ;
// false is accepted in place of a list
allowedTags?: string[] | false ;
allowVulnerableTags?: boolean ;
textFilter?: ((text: string, tagName: string) => string) ;
exclusiveFilter?: ((frame: IFrame) => boolean) ;
nonTextTags?: string[] ;
selfClosing?: string[] ;
// Per tag name: either a replacement tag name or a Transformer
transformTags?: { [tagName: string]: string | Transformer } ;
parser?: ParserOptions ;
disallowedTagsMode?: DisallowedTagsModes ;
enforceHtmlBoundary?: boolean ;
}
const defaults: IDefaults;
const options: IOptions;
// Builds a Transformer from a tag name and attributes; merge is optional
function simpleTransform(tagName: string, attribs: Attributes, merge?: boolean): Transformer;
}
接口定义:
方法名 | 入参 | 接口描述 |
---|---|---|
new Parser(cbs: Partial<Handler> | null, options?: ParserOptions) | handler,ParserOptions | 创建HTML解析器 |
write(chunk: string): void | string | 向HTML解析器内写入数据,解析一大块数据并调用相应的回调。 |
end(chunk?: string): void | string | 解析缓冲区的末尾并清除堆栈,调用 onend。 |
parseComplete(data: string): void | string | 重置解析器,然后解析完整的文档并将其推送到处理程序。 |
parseDocument(data: string, options?: ParserOptions): Document | string,ParserOptions | 解析数据,返回结果文档。 |
SanitizeHtml(dirty: string, options?: sanitize.IOptions): string | string,sanitize.IOptions | 清理HTML,实现HTML可信化 |
new XMLWriter(html: string, property?: Array<option>) | string,Array<option> | 创建XHTML转换器对象 |
convertToXML(callback: (content: string | null, error?: Error) => void):void | callback | 将HTML转化为XHTML |
DomUtils接口定义参照:DomUtils
支持 OpenHarmony API version 9 及以上版本。
|---- jsoup
| |---- entry # 示例代码文件夹
| |----src/main/ets
| |pages
| |----addTag.ets
| |----index.ets
| |----showResult.ets
| |---- htmlToXml # 将HTML转化为XHTML功能库
| |---- README.md # 安装使用方法
使用过程中发现任何问题都可以提 Issue 给我们,当然,我们也非常欢迎你给我们发 PR 。
本项目基于 MIT ,请自由地享受和参与开源。
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违反国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。