diff --git a/rssant_api/migrations/0021_auto_20200418_0512.py b/rssant_api/migrations/0021_auto_20200418_0512.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec58118350bbd3d331dc1976912fb9fd2b3c277c
--- /dev/null
+++ b/rssant_api/migrations/0021_auto_20200418_0512.py
@@ -0,0 +1,28 @@
+# Generated by Django 2.2.12 on 2020-04-18 05:12
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('rssant_api', '0020_feed_checksum_data'),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name='story',
+ name='audio_url',
+ field=models.TextField(blank=True, help_text='播客音频链接', null=True),
+ ),
+ migrations.AddField(
+ model_name='story',
+ name='iframe_url',
+ field=models.TextField(blank=True, help_text='视频iframe链接', null=True),
+ ),
+ migrations.AddField(
+ model_name='story',
+ name='image_url',
+ field=models.TextField(blank=True, help_text='图片链接', null=True),
+ ),
+ ]
diff --git a/rssant_api/models/story.py b/rssant_api/models/story.py
index b59eb071578fe99b6f164ac6ce2d552800961c21..5c8ae16190243f05c5262000e174d5a524e4940d 100644
--- a/rssant_api/models/story.py
+++ b/rssant_api/models/story.py
@@ -22,6 +22,10 @@ StoryDetailSchema = T.detail.fields("""
dt_watched
dt_favorited
""").extra_fields("""
+ author
+ image_url
+ audio_url
+ iframe_url
dt_synced
summary
content
@@ -58,6 +62,12 @@ class Story(Model, ContentHashMixin):
title = models.CharField(max_length=200, help_text="标题")
link = models.TextField(help_text="文章链接")
author = models.CharField(max_length=200, **optional, help_text='作者')
+ image_url = models.TextField(
+ **optional, help_text="图片链接")
+ audio_url = models.TextField(
+ **optional, help_text="播客音频链接")
+ iframe_url = models.TextField(
+ **optional, help_text="视频iframe链接")
has_mathjax = models.BooleanField(
**optional, default=False, help_text='has MathJax')
is_user_marked = models.BooleanField(
@@ -115,7 +125,12 @@ class Story(Model, ContentHashMixin):
offset = feed.total_storys
unique_ids = [x['unique_id'] for x in storys]
story_objects = {}
- q = Story.objects.filter(feed_id=feed_id, unique_id__in=unique_ids)
+ q = Story.objects\
+ .defer(
+ 'content', 'summary', 'title', 'author',
+ 'image_url', 'iframe_url', 'audio_url',
+ )\
+ .filter(feed_id=feed_id, unique_id__in=unique_ids)
for story in q.all():
story_objects[story.unique_id] = story
new_story_objects = []
@@ -140,6 +155,9 @@ class Story(Model, ContentHashMixin):
story.title = data["title"]
story.link = data["link"]
story.author = data["author"]
+ story.image_url = data['image_url']
+ story.iframe_url = data['iframe_url']
+ story.audio_url = data['audio_url']
story.has_mathjax = data['has_mathjax']
# 发布时间只第一次赋值,不更新
if not story.dt_published:
@@ -364,6 +382,22 @@ class UnionStory:
def link(self):
return self._story.link
+ @property
+ def author(self):
+ return self._story.author
+
+ @property
+ def image_url(self):
+ return self._story.image_url
+
+ @property
+ def iframe_url(self):
+ return self._story.iframe_url
+
+ @property
+ def audio_url(self):
+ return self._story.audio_url
+
@property
def has_mathjax(self):
return self._story.has_mathjax
diff --git a/rssant_api/views/story.py b/rssant_api/views/story.py
index dc04f0f73f9961bbf522429861d43adf48f04dd0..3910d1d8f97c9153980263a680d558205416fd40 100644
--- a/rssant_api/views/story.py
+++ b/rssant_api/views/story.py
@@ -18,6 +18,10 @@ StorySchema = T.dict(
unique_id=T.str.optional,
title=T.str.optional,
link=T.str.optional,
+ author=T.str.optional,
+ image_url=T.str.optional,
+ audio_url=T.str.optional,
+ iframe_url=T.str.optional,
has_mathjax=T.bool.optional,
dt_published=T.datetime.object.optional.invalid_to_default,
dt_updated=T.datetime.object.optional,
diff --git a/rssant_feedlib/finder.py b/rssant_feedlib/finder.py
index 52fe004a72807909cb8c522f262a932a7b118eab..cd88c9d38776bed4fdd71b9713f28214a6ed6daf 100644
--- a/rssant_feedlib/finder.py
+++ b/rssant_feedlib/finder.py
@@ -7,8 +7,7 @@ from validr import Invalid
from rssant_common.helper import coerce_url
-from .raw_parser import RawFeedParser, FeedParserError
-from .parser import FeedParser, FeedResult
+from .raw_parser import RawFeedParser, FeedParserError, RawFeedResult
from .reader import FeedReader
from .response import FeedResponse, FeedResponseStatus
from .processor import validate_url
@@ -240,7 +239,7 @@ class FeedFinder:
self._log(msg)
return res
- def _parse(self, response: FeedResponse) -> FeedResult:
+ def _parse(self, response: FeedResponse) -> RawFeedResult:
if response.feed_type.is_html:
msg = "the response content is HTML, not XML feed"
self._log(msg)
@@ -260,8 +259,6 @@ class FeedFinder:
msg = f"warnings: {';'.join(result.warnings)}"
self._log(msg)
LOG.warning(msg)
- parser = FeedParser()
- result = parser.parse(result)
return result
def _parse_html(self, response):
@@ -393,7 +390,7 @@ class FeedFinder:
self._guess_links()
self._guessed = True
- def find(self) -> Tuple[FeedResponse, FeedResult]:
+ def find(self) -> Tuple[FeedResponse, RawFeedResult]:
use_proxy = False
current_try = 0
should_abort = FeedResponseStatus.is_permanent_failure
diff --git a/rssant_feedlib/fulltext.py b/rssant_feedlib/fulltext.py
index 50b936cfca1386dc2c1d0b46210f8cc1aaca030f..8a7acd754b9ecb9dbec9b68abfe936bdfc428420 100644
--- a/rssant_feedlib/fulltext.py
+++ b/rssant_feedlib/fulltext.py
@@ -12,7 +12,7 @@ sentence_sep_s = [
r'“', r'”', r'‘', r'’', r'【', r'】', r'《', r'》', r'(', r')', r'〈', r'〉',
]
-RE_SENTENCE_SEP = re.compile(r'(?:\s*(?:{})\s*)+'.format('|'.join(sentence_sep_s)))
+RE_SENTENCE_SEP = re.compile(r'(?:[\s\d]*(?:{})[\s\d]*)+'.format('|'.join(sentence_sep_s)))
def split_sentences(text: str) -> List[str]:
diff --git a/rssant_feedlib/parser.py b/rssant_feedlib/parser.py
index 3a7df1ac52ecf06f923ac12bb5f2c499dad70b46..4e2852936693f37d496b50ee0aab9edfd671cb73 100644
--- a/rssant_feedlib/parser.py
+++ b/rssant_feedlib/parser.py
@@ -9,7 +9,7 @@ from .raw_parser import RawFeedResult, FeedParserError
from .feed_checksum import FeedChecksum
from rssant_api.helper import shorten
from .processor import (
- story_html_to_text, story_html_clean,
+ story_html_to_text, story_html_clean, story_extract_attach,
story_has_mathjax, process_story_links, normalize_url, validate_url,
)
@@ -42,6 +42,8 @@ StorySchema = T.dict(
summary=T.str.maxlen(_MAX_SUMMARY_LENGTH).optional,
has_mathjax=T.bool.optional,
image_url=T.url.invalid_to_default.optional,
+ iframe_url=T.url.invalid_to_default.optional,
+ audio_url=T.url.invalid_to_default.optional,
dt_published=T.datetime.object.optional,
dt_updated=T.datetime.object.optional,
author_name=T.str.maxlen(100).optional,
@@ -117,6 +119,14 @@ class FeedParser:
)
def _process_content(self, content, link):
+ # use loose=True to preserve iframe
+ content = story_html_clean(content, loose=True)
+ # extract video iframe, eg: bilibili.com
+ attach = None
+ is_short_story = content and len(content) < 2000
+ if is_short_story:
+ attach = story_extract_attach(content, base_url=link)
+ # clean again, remove iframe from content
content = story_html_clean(content)
content = process_story_links(content, link)
if len(content) > _MAX_CONTENT_LENGTH:
@@ -127,7 +137,7 @@ class FeedParser:
msg = 'story link=%r content length=%s still too large, will truncate it'
LOG.warning(msg, link, len(content))
content = content[:_MAX_CONTENT_LENGTH]
- return content
+ return content, attach
def _parse_story(self, story: dict, feed_url: str):
ident = story['ident'][:200]
@@ -139,10 +149,16 @@ class FeedParser:
valid_url = None
base_url = valid_url or feed_url
image_url = normalize_url(story['image_url'], base_url=base_url)
+ audio_url = normalize_url(story['audio_url'], base_url=base_url)
author_name = story_html_to_text(story['author_name'])[:100]
author_url = normalize_url(story['author_url'], base_url=base_url)
author_avatar_url = normalize_url(story['author_avatar_url'], base_url=base_url)
- content = self._process_content(story['content'], link=base_url)
+ iframe_url = None
+ content, attach = self._process_content(story['content'], link=base_url)
+ if attach:
+ iframe_url = attach.iframe_url
+ if (not audio_url) and attach.audio_url:
+ audio_url = attach.audio_url
if story['summary']:
summary = story_html_clean(story['summary'])
else:
@@ -157,6 +173,8 @@ class FeedParser:
summary=summary,
has_mathjax=has_mathjax,
image_url=image_url,
+ iframe_url=iframe_url,
+ audio_url=audio_url,
dt_published=story['dt_published'],
dt_updated=story['dt_updated'],
author_name=author_name,
diff --git a/rssant_feedlib/processor.py b/rssant_feedlib/processor.py
index 32b58621634417105f16e4b53974d604ffb1d864..f155f730b5f731b47dc74dfa40966fa39e6561b9 100644
--- a/rssant_feedlib/processor.py
+++ b/rssant_feedlib/processor.py
@@ -364,6 +364,34 @@ def story_readability(content):
return doc.summary(html_partial=True) or ""
+StoryAttach = namedtuple("StoryAttach", "iframe_url, audio_url")
+
+
+def _normalize_validate_url(url, base_url=None):
+ url = normalize_url(url, base_url=base_url)
+ if not url:
+ return None
+ try:
+ url = validate_url(url)
+ except Invalid:
+ url = None
+ return url
+
+
+def story_extract_attach(html, base_url=None) -> StoryAttach:
+ iframe_url = None
+ audio_url = None
+ dom = lxml_call(lxml.html.fromstring, html)
+ iframe_el = dom.find('.//iframe')
+ if iframe_el is not None:
+ iframe_url = _normalize_validate_url(iframe_el.get('src'), base_url=base_url)
+ audio_el = dom.find('.//audio')
+ if audio_el is not None:
+ audio_url = _normalize_validate_url(audio_el.get('src'), base_url=base_url)
+ attach = StoryAttach(iframe_url, audio_url)
+ return attach
+
+
RE_BLANK_LINE = re.compile(r'(\n\s*)(\n\s*)+')
lxml_html_parser = lxml.html.HTMLParser(
@@ -446,7 +474,7 @@ def story_html_to_text(content, clean=True):
RSSANT_HTML_SAFE_ATTRS = set(lxml_safe_attrs) | set(IMG_EXT_SRC_ATTRS)
RSSANT_HTML_SAFE_ATTRS.update({'srcset'})
-lxml_story_html_cleaner = Cleaner(
+_html_cleaner_options = dict(
scripts=True,
javascript=True,
comments=True,
@@ -455,7 +483,6 @@ lxml_story_html_cleaner = Cleaner(
meta=True,
page_structure=True,
processing_instructions=True,
- embedded=True,
frames=True,
forms=True,
annoying_tags=True,
@@ -463,11 +490,42 @@ lxml_story_html_cleaner = Cleaner(
safe_attrs=RSSANT_HTML_SAFE_ATTRS,
add_nofollow=True,
remove_tags=set(['body']),
- kill_tags=set(['noscript']),
+ kill_tags=set(['noscript', 'iframe', 'embed']),
+)
+
+
+class FeedLooseHTMLCleaner(Cleaner):
+ """
+ https://lxml.de/api/lxml.html.clean.Cleaner-class.html
+ https://lxml.de/api/lxml.html.clean-pysrc.html#Cleaner.allow_embedded_url
+ """
+
+ def allow_embedded_url(self, el, url):
+ """
+ Decide whether a URL that was found in an element's attributes or text
+ is configured to be accepted or rejected.
+
+ :param el: an element.
+ :param url: a URL found on the element.
+ :return: true to accept the URL and false to reject it.
+ """
+ if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
+ return False
+ return True
+
+
+lxml_story_html_cleaner = Cleaner(
+ **_html_cleaner_options,
+ embedded=True,
+)
+lxml_story_html_loose_cleaner = FeedLooseHTMLCleaner(
+ **_html_cleaner_options,
+ embedded=False, # allow iframe
+ whitelist_tags=['iframe'],
)
-def story_html_clean(content):
+def story_html_clean(content, loose=False):
"""
>>> content = '''
...
@@ -493,11 +551,21 @@ def story_html_clean(content):
>>> # lxml can not parse below content, we handled the exception
>>> content = ''
>>> assert story_html_clean(content)
+ >>> # loose cleaner allows iframe, but not embed flash
+ >>> content = ''
+ >>> story_html_clean(content)
+ ''
+ >>> 'iframe' in story_html_clean(content, loose=True)
+ True
+ >>> content = '