diff --git a/rssant_api/migrations/0021_auto_20200418_0512.py b/rssant_api/migrations/0021_auto_20200418_0512.py new file mode 100644 index 0000000000000000000000000000000000000000..ec58118350bbd3d331dc1976912fb9fd2b3c277c --- /dev/null +++ b/rssant_api/migrations/0021_auto_20200418_0512.py @@ -0,0 +1,28 @@ +# Generated by Django 2.2.12 on 2020-04-18 05:12 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('rssant_api', '0020_feed_checksum_data'), + ] + + operations = [ + migrations.AddField( + model_name='story', + name='audio_url', + field=models.TextField(blank=True, help_text='播客音频链接', null=True), + ), + migrations.AddField( + model_name='story', + name='iframe_url', + field=models.TextField(blank=True, help_text='视频iframe链接', null=True), + ), + migrations.AddField( + model_name='story', + name='image_url', + field=models.TextField(blank=True, help_text='图片链接', null=True), + ), + ] diff --git a/rssant_api/models/story.py b/rssant_api/models/story.py index b59eb071578fe99b6f164ac6ce2d552800961c21..5c8ae16190243f05c5262000e174d5a524e4940d 100644 --- a/rssant_api/models/story.py +++ b/rssant_api/models/story.py @@ -22,6 +22,10 @@ StoryDetailSchema = T.detail.fields(""" dt_watched dt_favorited """).extra_fields(""" + author + image_url + audio_url + iframe_url dt_synced summary content @@ -58,6 +62,12 @@ class Story(Model, ContentHashMixin): title = models.CharField(max_length=200, help_text="标题") link = models.TextField(help_text="文章链接") author = models.CharField(max_length=200, **optional, help_text='作者') + image_url = models.TextField( + **optional, help_text="图片链接") + audio_url = models.TextField( + **optional, help_text="播客音频链接") + iframe_url = models.TextField( + **optional, help_text="视频iframe链接") has_mathjax = models.BooleanField( **optional, default=False, help_text='has MathJax') is_user_marked = models.BooleanField( @@ -115,7 +125,12 @@ class Story(Model, ContentHashMixin): offset = 
feed.total_storys unique_ids = [x['unique_id'] for x in storys] story_objects = {} - q = Story.objects.filter(feed_id=feed_id, unique_id__in=unique_ids) + q = Story.objects\ + .defer( + 'content', 'summary', 'title', 'author', + 'image_url', 'iframe_url', 'audio_url', + )\ + .filter(feed_id=feed_id, unique_id__in=unique_ids) for story in q.all(): story_objects[story.unique_id] = story new_story_objects = [] @@ -140,6 +155,9 @@ class Story(Model, ContentHashMixin): story.title = data["title"] story.link = data["link"] story.author = data["author"] + story.image_url = data['image_url'] + story.iframe_url = data['iframe_url'] + story.audio_url = data['audio_url'] story.has_mathjax = data['has_mathjax'] # 发布时间只第一次赋值,不更新 if not story.dt_published: @@ -364,6 +382,22 @@ class UnionStory: def link(self): return self._story.link + @property + def author(self): + return self._story.author + + @property + def image_url(self): + return self._story.image_url + + @property + def iframe_url(self): + return self._story.iframe_url + + @property + def audio_url(self): + return self._story.audio_url + @property def has_mathjax(self): return self._story.has_mathjax diff --git a/rssant_api/views/story.py b/rssant_api/views/story.py index dc04f0f73f9961bbf522429861d43adf48f04dd0..3910d1d8f97c9153980263a680d558205416fd40 100644 --- a/rssant_api/views/story.py +++ b/rssant_api/views/story.py @@ -18,6 +18,10 @@ StorySchema = T.dict( unique_id=T.str.optional, title=T.str.optional, link=T.str.optional, + author=T.str.optional, + image_url=T.str.optional, + audio_url=T.str.optional, + iframe_url=T.str.optional, has_mathjax=T.bool.optional, dt_published=T.datetime.object.optional.invalid_to_default, dt_updated=T.datetime.object.optional, diff --git a/rssant_feedlib/finder.py b/rssant_feedlib/finder.py index 52fe004a72807909cb8c522f262a932a7b118eab..cd88c9d38776bed4fdd71b9713f28214a6ed6daf 100644 --- a/rssant_feedlib/finder.py +++ b/rssant_feedlib/finder.py @@ -7,8 +7,7 @@ from validr import 
Invalid from rssant_common.helper import coerce_url -from .raw_parser import RawFeedParser, FeedParserError -from .parser import FeedParser, FeedResult +from .raw_parser import RawFeedParser, FeedParserError, RawFeedResult from .reader import FeedReader from .response import FeedResponse, FeedResponseStatus from .processor import validate_url @@ -240,7 +239,7 @@ class FeedFinder: self._log(msg) return res - def _parse(self, response: FeedResponse) -> FeedResult: + def _parse(self, response: FeedResponse) -> RawFeedResult: if response.feed_type.is_html: msg = "the response content is HTML, not XML feed" self._log(msg) @@ -260,8 +259,6 @@ class FeedFinder: msg = f"warnings: {';'.join(result.warnings)}" self._log(msg) LOG.warning(msg) - parser = FeedParser() - result = parser.parse(result) return result def _parse_html(self, response): @@ -393,7 +390,7 @@ class FeedFinder: self._guess_links() self._guessed = True - def find(self) -> Tuple[FeedResponse, FeedResult]: + def find(self) -> Tuple[FeedResponse, RawFeedResult]: use_proxy = False current_try = 0 should_abort = FeedResponseStatus.is_permanent_failure diff --git a/rssant_feedlib/fulltext.py b/rssant_feedlib/fulltext.py index 50b936cfca1386dc2c1d0b46210f8cc1aaca030f..8a7acd754b9ecb9dbec9b68abfe936bdfc428420 100644 --- a/rssant_feedlib/fulltext.py +++ b/rssant_feedlib/fulltext.py @@ -12,7 +12,7 @@ sentence_sep_s = [ r'“', r'”', r'‘', r'’', r'【', r'】', r'《', r'》', r'(', r')', r'〈', r'〉', ] -RE_SENTENCE_SEP = re.compile(r'(?:\s*(?:{})\s*)+'.format('|'.join(sentence_sep_s))) +RE_SENTENCE_SEP = re.compile(r'(?:[\s\d]*(?:{})[\s\d]*)+'.format('|'.join(sentence_sep_s))) def split_sentences(text: str) -> List[str]: diff --git a/rssant_feedlib/parser.py b/rssant_feedlib/parser.py index 3a7df1ac52ecf06f923ac12bb5f2c499dad70b46..4e2852936693f37d496b50ee0aab9edfd671cb73 100644 --- a/rssant_feedlib/parser.py +++ b/rssant_feedlib/parser.py @@ -9,7 +9,7 @@ from .raw_parser import RawFeedResult, FeedParserError from 
.feed_checksum import FeedChecksum from rssant_api.helper import shorten from .processor import ( - story_html_to_text, story_html_clean, + story_html_to_text, story_html_clean, story_extract_attach, story_has_mathjax, process_story_links, normalize_url, validate_url, ) @@ -42,6 +42,8 @@ StorySchema = T.dict( summary=T.str.maxlen(_MAX_SUMMARY_LENGTH).optional, has_mathjax=T.bool.optional, image_url=T.url.invalid_to_default.optional, + iframe_url=T.url.invalid_to_default.optional, + audio_url=T.url.invalid_to_default.optional, dt_published=T.datetime.object.optional, dt_updated=T.datetime.object.optional, author_name=T.str.maxlen(100).optional, @@ -117,6 +119,14 @@ class FeedParser: ) def _process_content(self, content, link): + # use loose=True to reserve iframe + content = story_html_clean(content, loose=True) + # extract video iframe, eg: bilibili.com + attach = None + is_short_story = content and len(content) < 2000 + if is_short_story: + attach = story_extract_attach(content, base_url=link) + # clean again, remove iframe from content content = story_html_clean(content) content = process_story_links(content, link) if len(content) > _MAX_CONTENT_LENGTH: @@ -127,7 +137,7 @@ class FeedParser: msg = 'story link=%r content length=%s still too large, will truncate it' LOG.warning(msg, link, len(content)) content = content[:_MAX_CONTENT_LENGTH] - return content + return content, attach def _parse_story(self, story: dict, feed_url: str): ident = story['ident'][:200] @@ -139,10 +149,16 @@ class FeedParser: valid_url = None base_url = valid_url or feed_url image_url = normalize_url(story['image_url'], base_url=base_url) + audio_url = normalize_url(story['audio_url'], base_url=base_url) author_name = story_html_to_text(story['author_name'])[:100] author_url = normalize_url(story['author_url'], base_url=base_url) author_avatar_url = normalize_url(story['author_avatar_url'], base_url=base_url) - content = self._process_content(story['content'], link=base_url) + iframe_url = 
None + content, attach = self._process_content(story['content'], link=base_url) + if attach: + iframe_url = attach.iframe_url + if (not audio_url) and attach.audio_url: + audio_url = attach.audio_url if story['summary']: summary = story_html_clean(story['summary']) else: @@ -157,6 +173,8 @@ class FeedParser: summary=summary, has_mathjax=has_mathjax, image_url=image_url, + iframe_url=iframe_url, + audio_url=audio_url, dt_published=story['dt_published'], dt_updated=story['dt_updated'], author_name=author_name, diff --git a/rssant_feedlib/processor.py b/rssant_feedlib/processor.py index 32b58621634417105f16e4b53974d604ffb1d864..f155f730b5f731b47dc74dfa40966fa39e6561b9 100644 --- a/rssant_feedlib/processor.py +++ b/rssant_feedlib/processor.py @@ -364,6 +364,34 @@ def story_readability(content): return doc.summary(html_partial=True) or "" +StoryAttach = namedtuple("StoryAttach", "iframe_url, audio_url") + + +def _normalize_validate_url(url, base_url=None): + url = normalize_url(url, base_url=base_url) + if not url: + return None + try: + url = validate_url(url) + except Invalid: + url = None + return url + + +def story_extract_attach(html, base_url=None) -> StoryAttach: + iframe_url = None + audio_url = None + dom = lxml_call(lxml.html.fromstring, html) + iframe_el = dom.find('.//iframe') + if iframe_el is not None: + iframe_url = _normalize_validate_url(iframe_el.get('src'), base_url=base_url) + audio_el = dom.find('.//audio') + if audio_el is not None: + audio_url = _normalize_validate_url(audio_el.get('src'), base_url=base_url) + attach = StoryAttach(iframe_url, audio_url) + return attach + + RE_BLANK_LINE = re.compile(r'(\n\s*)(\n\s*)+') lxml_html_parser = lxml.html.HTMLParser( @@ -446,7 +474,7 @@ def story_html_to_text(content, clean=True): RSSANT_HTML_SAFE_ATTRS = set(lxml_safe_attrs) | set(IMG_EXT_SRC_ATTRS) RSSANT_HTML_SAFE_ATTRS.update({'srcset'}) -lxml_story_html_cleaner = Cleaner( +_html_cleaner_options = dict( scripts=True, javascript=True, comments=True, @@ 
-455,7 +483,6 @@ lxml_story_html_cleaner = Cleaner( meta=True, page_structure=True, processing_instructions=True, - embedded=True, frames=True, forms=True, annoying_tags=True, @@ -463,11 +490,42 @@ lxml_story_html_cleaner = Cleaner( safe_attrs=RSSANT_HTML_SAFE_ATTRS, add_nofollow=True, remove_tags=set(['body']), - kill_tags=set(['noscript']), + kill_tags=set(['noscript', 'iframe', 'embed']), +) + + +class FeedLooseHTMLCleaner(Cleaner): + """ + https://lxml.de/api/lxml.html.clean.Cleaner-class.html + https://lxml.de/api/lxml.html.clean-pysrc.html#Cleaner.allow_embedded_url + """ + + def allow_embedded_url(self, el, url): + """ + Decide whether a URL that was found in an element's attributes or text + if configured to be accepted or rejected. + + :param el: an element. + :param url: a URL found on the element. + :return: true to accept the URL and false to reject it. + """ + if self.whitelist_tags is not None and el.tag not in self.whitelist_tags: + return False + return True + + +lxml_story_html_cleaner = Cleaner( + **_html_cleaner_options, + embedded=True, +) +lxml_story_html_loose_cleaner = FeedLooseHTMLCleaner( + **_html_cleaner_options, + embedded=False, # allow iframe + whitelist_tags=['iframe'], ) -def story_html_clean(content): +def story_html_clean(content, loose=False): """ >>> content = ''' ...
@@ -493,11 +551,21 @@ def story_html_clean(content):
     >>> # lxml can not parse below content, we handled the exception
     >>> content = ''
     >>> assert story_html_clean(content)
+    >>> # the loose cleaner keeps iframe but still removes embed (flash)
+    >>> content = ''
+    >>> story_html_clean(content)
+    '
' + >>> 'iframe' in story_html_clean(content, loose=True) + True + >>> content = '' + >>> story_html_clean(content, loose=True) + '
' """ if (not content) or (not content.strip()): return "" + cleaner = lxml_story_html_loose_cleaner if loose else lxml_story_html_cleaner try: - content = lxml_call(lxml_story_html_cleaner.clean_html, content).strip() + content = lxml_call(cleaner.clean_html, content).strip() except LXMLError as ex: LOG.info(f'lxml unable to parse content: {ex} content={content!r}', exc_info=ex) content = html_escape(content) diff --git a/rssant_feedlib/raw_parser.py b/rssant_feedlib/raw_parser.py index 0d3a95fd33c2ed4f21c558538e06172f75bb4dca..81ae5338f1e6f3dc0351e2659610d334e88e2206 100644 --- a/rssant_feedlib/raw_parser.py +++ b/rssant_feedlib/raw_parser.py @@ -6,6 +6,7 @@ import time from io import BytesIO import atoma +from atoma.json_feed import JSONFeedItem as RawJSONFeedItem import feedparser from django.utils import timezone from dateutil.parser import parse as parse_datetime @@ -48,6 +49,7 @@ RawStorySchema = T.dict( content=T.str.maxlen(_MAX_CONTENT_LENGTH).optional, summary=T.str.maxlen(_MAX_SUMMARY_LENGTH).optional, image_url=T.str.optional, + audio_url=T.str.optional, dt_published=T.datetime.object.optional, dt_updated=T.datetime.object.optional, author_name=T.str.optional, @@ -124,15 +126,26 @@ class RawFeedParser: url = detail.get('href') return dict(author_name=name, author_url=url, author_avatar_url=avatar) - def _get_story_image_url(self, item) -> str: + def _get_story_enclosure_url(self, item, mime_type) -> str: if item.get('enclosures'): for e in item['enclosures']: - mime_type = e.get('type') + el_mime_type = e.get('type') url = e.get('href') - if url and mime_type and 'image' in mime_type: + if url and el_mime_type and mime_type in el_mime_type: return url return None + def _get_story_image_url(self, item) -> str: + image = item.get('image') + if image: + url = image.get('href') + if url: + return url + return self._get_story_enclosure_url(item, 'image') + + def _get_story_audio_url(self, item) -> str: + return self._get_story_enclosure_url(item, 'audio') + 
def _get_story_content(self, item) -> str: content = '' if item.get("content"): @@ -214,6 +227,7 @@ class RawFeedParser: story['content'] = self._get_story_content(item) story['summary'] = item.get("summary") story['image_url'] = self._get_story_image_url(item) + story['audio_url'] = self._get_story_audio_url(item) story['dt_published'] = self._get_date(item, 'published_parsed') story['dt_updated'] = self._get_date(item, 'updated_parsed') story.update(self._get_author_info(item)) @@ -250,6 +264,36 @@ class RawFeedParser: raise FeedParserError("JSON parse error: {}".format(ex)) from ex return data + def _get_json_feed_story_audio_url(self, item: RawJSONFeedItem) -> str: + if not item.attachments: + return None + for x in item.attachments: + if x.url and x.mime_type and 'audio' in x.mime_type: + return x.url + return None + + def _get_json_feed_story(self, item: RawJSONFeedItem): + ident = item.id_ or item.url or item.title + if not ident: + return None + content = item.content_html or item.content_text or item.summary or '' + summary = item.summary if item.summary != content else None + audio_url = self._get_json_feed_story_audio_url(item) + story = dict( + ident=ident, + url=item.url, + title=item.title or ident, + content=content, + summary=summary, + image_url=item.image or item.banner_image, + audio_url=audio_url, + dt_published=item.date_published, + dt_updated=item.date_modified, + **self._get_json_feed_author(item.author), + ) + story = self._normalize_story_content_summary(story) + return story + def _parse_json_feed(self, response: FeedResponse) -> RawFeedResult: data = self._load_json(response) if not isinstance(data, dict): @@ -270,26 +314,11 @@ class RawFeedParser: ) warnings = [] storys = [] - item: atoma.JSONFeedItem for i, item in enumerate(feed.items or []): - ident = item.id_ or item.url or item.title - if not ident: + story = self._get_json_feed_story(item) + if not story: warnings.append(f"story#{i} no id, skip it") continue - content = 
item.content_html or item.content_text or item.summary or '' - summary = item.summary if item.summary != content else None - story = dict( - ident=ident, - url=item.url, - title=item.title or ident, - content=content, - summary=summary, - image_url=item.image or item.banner_image, - dt_published=item.date_published, - dt_updated=item.date_modified, - **self._get_json_feed_author(item.author), - ) - story = self._normalize_story_content_summary(story) storys.append(story) if (not storys) and warnings: raise FeedParserError('; '.join(warnings)) diff --git a/rssant_harbor/actors/rss.py b/rssant_harbor/actors/rss.py index d40b2831d256a75567eb72c16789baee1614d8dc..acbce8a5018a1e17a648d6fb4f11b765d5c713e8 100644 --- a/rssant_harbor/actors/rss.py +++ b/rssant_harbor/actors/rss.py @@ -30,6 +30,9 @@ StorySchemaFields = dict( content_hash_base64=T.str, author=T.str.optional, link=T.str.optional, + image_url=T.url.optional, + iframe_url=T.url.optional, + audio_url=T.url.optional, has_mathjax=T.bool.optional, dt_published=T.datetime.object.optional.invalid_to_default, dt_updated=T.datetime.object.optional, @@ -198,7 +201,7 @@ def do_update_feed( for story in modified_storys: if not story.link: continue - if need_fetch_story and (not is_fulltext_content(story.content)): + if need_fetch_story and (not is_fulltext_story(story)): text = processor.story_html_to_text(story.content) num_sub_sentences = len(split_sentences(text)) ctx.tell('worker_rss.fetch_story', dict( @@ -211,6 +214,12 @@ def do_update_feed( _detect_story_images(ctx, story) +def is_fulltext_story(story): + if story.iframe_url or story.audio_url: + return True + return is_fulltext_content(story.content) + + def is_rssant_changelog(url: str): """ >>> is_rssant_changelog('http://localhost:6789/changelog?version=1.0.0') diff --git a/rssant_worker/actors/rss.py b/rssant_worker/actors/rss.py index c47f83dab81448ded58da0a867af0dc81f88e280..c296dd5006de20054fd841f231203701b10e46d6 100644 --- a/rssant_worker/actors/rss.py 
+++ b/rssant_worker/actors/rss.py @@ -52,6 +52,9 @@ StorySchema = T.dict( content_hash_base64=T.str, author=T.str.optional, link=T.url.optional, + image_url=T.url.optional, + iframe_url=T.url.optional, + audio_url=T.url.optional, has_mathjax=T.bool.optional, dt_published=T.datetime.optional, dt_updated=T.datetime.optional, @@ -390,6 +393,9 @@ def _get_storys(entries: list): title = data['title'] story['has_mathjax'] = data['has_mathjax'] story['link'] = data['url'] + story['image_url'] = data['image_url'] + story['audio_url'] = data['audio_url'] + story['iframe_url'] = data['iframe_url'] story['summary'] = summary story['content'] = content content_hash_base64 = compute_hash_base64(content, summary, title) diff --git a/tests/feedlib/processor/test_processor.py b/tests/feedlib/processor/test_processor.py index 842c1b52f67f0169719241963881f4badd10ad58..14f43bda224104b4a2371bfced98f984c7c84304 100644 --- a/tests/feedlib/processor/test_processor.py +++ b/tests/feedlib/processor/test_processor.py @@ -7,6 +7,7 @@ from rssant_feedlib.processor import ( normalize_url, validate_url, get_html_redirect_url, + story_extract_attach, ) _data_dir = Path(__file__).parent.parent / 'testdata/processor' @@ -102,3 +103,17 @@ def test_get_html_redirect_url(filename): html = _read_text(filename) got = get_html_redirect_url(html, base_url=base_url) assert got == expect + + +def test_story_extract_attach_iframe(): + html = _read_text('test_iframe.html') + attach = story_extract_attach(html) + expect = 'https://player.bilibili.com/player.html?aid=75057811' + assert attach.iframe_url == expect + + +def test_story_extract_attach_audio(): + html = _read_text('test_audio.html') + attach = story_extract_attach(html) + expect = 'https://chtbl.com/track/r.typlog.com/pythonhunter/8417630310_189758.mp3' + assert attach.audio_url == expect diff --git a/tests/feedlib/test_parser.py b/tests/feedlib/test_parser.py index 8c01ae8d2a84975f7f6d6f9ca7e9bea3e1998b5b..57a21d500a97f1cc6463a8e0a975eb5c0cab268b 
100644 --- a/tests/feedlib/test_parser.py +++ b/tests/feedlib/test_parser.py @@ -1,5 +1,6 @@ import logging import os +import re import json import datetime from pathlib import Path @@ -7,7 +8,7 @@ from pathlib import Path import pytest from rssant_feedlib import ( - RawFeedParser, FeedParser, + RawFeedParser, FeedParser, FeedResult, FeedParserError, FeedResponseBuilder, ) from rssant_feedlib.raw_parser import _MAX_CONTENT_LENGTH as _RAW_MAX_CONTENT_LENGTH @@ -257,3 +258,38 @@ def test_parser_and_checksum(filepath): assert result.feed assert result.storys assert result.checksum.size() == len(result.storys) + + +def _parse_well_feed(filename) -> FeedResult: + response = _read_response(_data_dir / 'well', filename) + raw_result = RawFeedParser().parse(response) + assert raw_result.feed + assert raw_result.storys + assert not raw_result.warnings + result = FeedParser().parse(raw_result) + assert len(result.storys) == len(raw_result.storys) + return result + + +def test_parser_iframe(): + result = _parse_well_feed('bilibili_iframe.xml') + expect = r'https://player\.bilibili\.com/player\.html\?aid=\d+' + for story in result.storys: + assert story['iframe_url'] + assert re.match(expect, story['iframe_url']) + + +def test_parser_audio_typlog(): + result = _parse_well_feed('typlog_audio.xml') + expect = r'https://chtbl\.com/track/6AGABB/r\.typlog\.com/.+\.mp3' + for story in result.storys: + assert story['audio_url'] + assert re.match(expect, story['audio_url']) + + +def test_parser_audio_jsonfeed(): + result = _parse_well_feed('jsonfeed_audio.json') + expect = r'http://therecord\.co/downloads/.+\.m4a' + for story in result.storys: + assert story['audio_url'] + assert re.match(expect, story['audio_url']) diff --git a/tests/feedlib/testdata/parser/well/bilibili_iframe.xml b/tests/feedlib/testdata/parser/well/bilibili_iframe.xml new file mode 100644 index 0000000000000000000000000000000000000000..37aea6e7cb715322cd81bad087415ed0601296cd --- /dev/null +++ 
b/tests/feedlib/testdata/parser/well/bilibili_iframe.xml @@ -0,0 +1,180 @@ + + + + <![CDATA[极客之选 的 bilibili 频道 极客博物馆(怀旧产品)]]> + https://space.bilibili.com/142821407/#/channel/detail?cid=49017 + + + RSSHub + i@diygod.me (DIYgod) + + + zh-cn + + Fri, 17 Apr 2020 12:30:00 GMT + 60 + + + <![CDATA[五年前拿奖到手软,这款锤子手机的设计今天看依然惊艳【极客博物馆第五十二期】]]> +

]]>
+ Sat, 07 Dec 2019 11:09:36 GMT + https://www.bilibili.com/video/av78398082 + https://www.bilibili.com/video/av78398082 + + + + + + + + +
+ + + <![CDATA[第一款圆形智能手表,摩托罗拉在五年前引发了手表刷机热【极客博物馆第五十一期】]]> +

]]>
+ Sat, 23 Nov 2019 13:30:24 GMT + https://www.bilibili.com/video/av76732820 + https://www.bilibili.com/video/av76732820 + + + + + + + + +
+ + + <![CDATA[有英伟达和 Beats 加持,七年前这款 HTC 旗舰不输给 iPhone【极客博物馆第五十期】]]> +

]]>
+ Sat, 09 Nov 2019 12:00:47 GMT + https://www.bilibili.com/video/av75065184 + https://www.bilibili.com/video/av75065184 + + + + + + + + +
+ + + <![CDATA[苹果 15 年前一万块的笔记本,竟然比 MacBook 还「良心」?【极客博物馆第四十九期】]]> +

]]>
+ Sat, 19 Oct 2019 13:00:06 GMT + https://www.bilibili.com/video/av71745090 + https://www.bilibili.com/video/av71745090 + + + + + + + + +
+ + + <![CDATA[六年前诺基亚这款「奥利奥」手机,曾把拍照做出了新高度【极客博物馆第四十八期】]]> +

]]>
+ Sat, 05 Oct 2019 12:42:25 GMT + https://www.bilibili.com/video/av70208637 + https://www.bilibili.com/video/av70208637 + + + + + + + + +
+ + + <![CDATA[首款多彩 iPhone 销量惨淡,但它为苹果最畅销的机型打下基石【极客博物馆第四十七期】]]> +

]]>
+ Sun, 22 Sep 2019 11:18:15 GMT + https://www.bilibili.com/video/av68649956 + https://www.bilibili.com/video/av68649956 + + + + + + + + +
+ + + <![CDATA[它是乔布斯最后的得意之作,时隔九年依然经典【极客博物馆第四十六期】]]> +

]]>
+ Sun, 01 Sep 2019 01:00:26 GMT + https://www.bilibili.com/video/av66201335 + https://www.bilibili.com/video/av66201335 + + + + + + + + +
+ + + <![CDATA[十年前,它是国产智能机的骄傲【极客博物馆第四十五期】]]> +

]]>
+ Sun, 25 Aug 2019 02:00:18 GMT + https://www.bilibili.com/video/av65235295 + https://www.bilibili.com/video/av65235295 + + + + + + + + +
+ + + <![CDATA[全球卖出一亿台,任天堂 13 年前这款游戏机打赢索尼和微软【极客博物馆 vol.44】]]> +

]]>
+ Sun, 11 Aug 2019 01:00:13 GMT + https://www.bilibili.com/video/av63159500 + https://www.bilibili.com/video/av63159500 + + + + + + + + +
+ + + <![CDATA[九年前售价不到 400 块,这款苹果入门产品让很多人成为果粉【极客博物馆第 43 期】]]> +

]]>
+ Sun, 28 Jul 2019 01:00:14 GMT + https://www.bilibili.com/video/av61046186 + https://www.bilibili.com/video/av61046186 + + + + + + + + +
+ +
+
diff --git a/tests/feedlib/testdata/parser/well/jsonfeed_audio.json b/tests/feedlib/testdata/parser/well/jsonfeed_audio.json new file mode 100644 index 0000000000000000000000000000000000000000..eee0ca45cd612a054dfc2f29fc91f09fb2f3ece1 --- /dev/null +++ b/tests/feedlib/testdata/parser/well/jsonfeed_audio.json @@ -0,0 +1,26 @@ +{ + "version": "https://jsonfeed.org/version/1", + "user_comment": "This is a podcast feed. You can add this feed to your podcast client using the following URL: http://therecord.co/feed.json", + "title": "The Record", + "home_page_url": "http://therecord.co/", + "feed_url": "http://therecord.co/feed.json", + "items": [ + { + "id": "http://therecord.co/chris-parrish", + "title": "Special #1 - Chris Parrish", + "url": "http://therecord.co/chris-parrish", + "content_text": "Chris has worked at Adobe and as a founder of Rogue Sheep, which won an Apple Design Award for Postage. Chris’s new company is Aged & Distilled with Guy English — which shipped Napkin, a Mac app for visual collaboration. Chris is also the co-host of The Record. He lives on Bainbridge Island, a quick ferry ride from Seattle.", + "content_html": "Chris has worked at Adobe and as a founder of Rogue Sheep, which won an Apple Design Award for Postage. Chris’s new company is Aged & Distilled with Guy English — which shipped Napkin, a Mac app for visual collaboration. Chris is also the co-host of The Record. 
He lives on Bainbridge Island, a quick ferry ride from Seattle.", + "summary": "Brent interviews Chris Parrish, co-host of The Record and one-half of Aged & Distilled.", + "date_published": "2014-05-09T14:04:00-07:00", + "attachments": [ + { + "url": "http://therecord.co/downloads/The-Record-sp1e1-ChrisParrish.m4a", + "mime_type": "audio/x-m4a", + "size_in_bytes": 89970236, + "duration_in_seconds": 6629 + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/feedlib/testdata/parser/well/typlog_audio.xml b/tests/feedlib/testdata/parser/well/typlog_audio.xml new file mode 100644 index 0000000000000000000000000000000000000000..94b82f307b316bf7405c88116385bd00a6f01227 --- /dev/null +++ b/tests/feedlib/testdata/parser/well/typlog_audio.xml @@ -0,0 +1,252 @@ + + + +Typlog (https://typlog.com)<![CDATA[ 捕蛇者说 ]]>http://pythonhunter.org/zhepisodicThu, 16 Apr 2020 01:13:07 +0000https://i.typlog.com/pythonhunter/8444690454_041962.png?x-oss-process=style/sl<![CDATA[ 捕蛇者说 ]]>http://pythonhunter.org/no + +<![CDATA[ Ep 07. Lisp 程序员眼中的 Python? ]]>http://pythonhunter.org/episodes/7http://pythonhunter.org/episodes/740:44nofullHi!本期我们请到了稀有的 Lisp 程序员 David,来和我们聊一聊函数式编程的一些概念,Common Lisp 的特点,如何入门函数式编程。

+

本期音频由 laixintao 剪辑,第一次剪辑请大家包涵,如果音频有问题可以在评论或者 telegram 群提出。

+

本期嘉宾&主持:

  1. David Twitter Blog
  2. +
  3. laixintao Twitter Blog
  4. +
+

内容提要:

  1. 01:40 什么是知识图谱
  2. +
  3. 04:00 Lisp 程序员是如何看 Python 的
  4. +
  5. 13:00 函数式编程
  6. +
  7. 20:00 函数式编程的类型推导
  8. +
  9. 24:00 Debug 中的“时间机器”
  10. +
  11. 27:00 Lazy Evaluation
  12. +
  13. 31:00 Python 程序员如何入门 Lisp
  14. +
  15. 37:00 推荐环节
  16. +
+

本期提到的一些资料如下:

  1. R:Past and Future History
  2. +
  3. pampy: Pattern Match for Python
  4. +
  5. Monad
  6. +
  7. Notion (David 之选🎉)
  8. +
  9. 附:《给Lisp程序员的Python简介》
  10. +
+ ]]>
Hi!本期我们请到了稀有的 Lisp 程序员 David,来和我们聊一聊函数式编程的一些概念,Common Lisp 的特点,如何入门函数式编程。

+

本期音频由 laixintao 剪辑,第一次剪辑请大家包涵,如果音频有问题可以在评论或者 telegram 群提出。

+

本期嘉宾&主持:

  1. David Twitter Blog
  2. +
  3. laixintao Twitter Blog
  4. +
+

内容提要:

  1. 01:40 什么是知识图谱
  2. +
  3. 04:00 Lisp 程序员是如何看 Python 的
  4. +
  5. 13:00 函数式编程
  6. +
  7. 20:00 函数式编程的类型推导
  8. +
  9. 24:00 Debug 中的“时间机器”
  10. +
  11. 27:00 Lazy Evaluation
  12. +
  13. 31:00 Python 程序员如何入门 Lisp
  14. +
  15. 37:00 推荐环节
  16. +
+

本期提到的一些资料如下:

  1. R:Past and Future History
  2. +
  3. pampy: Pattern Match for Python
  4. +
  5. Monad
  6. +
  7. Notion (David 之选🎉)
  8. +
  9. 附:《给Lisp程序员的Python简介》
  10. +
+ ]]>
Tue, 22 Oct 2019 08:42:23 +0000
+ +<![CDATA[ [特别篇] PyCon 举办之前,我们在酒店的闲谈 ]]>http://pythonhunter.org/episodes/sp01http://pythonhunter.org/episodes/sp0158:30nofull本期嘉宾:

+ +

明天就是 PyCon,四位主创实现了首次全员聚会。
+听我们一起聊一聊,关于 PyCon 的期待与想法

+ ]]>
本期嘉宾:

+ +

明天就是 PyCon,四位主创实现了首次全员聚会。
+听我们一起聊一聊,关于 PyCon 的期待与想法

+ ]]>
Fri, 20 Sep 2019 16:06:21 +0000
+ +<![CDATA[ Ep 06. 和 the5fire 来聊聊 Django ]]>http://pythonhunter.org/episodes/6http://pythonhunter.org/episodes/61:34:43nofull
print "PyCon 2019 上海站即将开始,我们也会有一个专门的展位参与其中,期待与大家的相见"
+
+

嘉宾:
+- the5fire
+- laike9m
+- laixintao

+

主持:
+- Adam Wen

+

Django,
+适合新手学习吗? 适合实践开发吗? 又有哪些优势呢?
+本期节目,我们将和 the5fire 来聊一聊 Django 的一些好处与特点,推荐一些 Django 的资料与插件,聊聊《Django企业开发实战》这本书。

+

欢迎大家收听本期节目,欢迎大家评论留言,我们将会从官网+知乎评论区抽出 5 位幸运听众送出《Django企业开发实战》

+ ]]>
print "PyCon 2019 上海站即将开始,我们也会有一个专门的展位参与其中,期待与大家的相见"
+
+

嘉宾:
+- the5fire
+- laike9m
+- laixintao

+

主持:
+- Adam Wen

+

Django,
+适合新手学习吗? 适合实践开发吗? 又有哪些优势呢?
+本期节目,我们将和 the5fire 来聊一聊 Django 的一些好处与特点,推荐一些 Django 的资料与插件,聊聊《Django企业开发实战》这本书。

+

欢迎大家收听本期节目,欢迎大家评论留言,我们将会从官网+知乎评论区抽出 5 位幸运听众送出《Django企业开发实战》

+ ]]>
Fri, 20 Sep 2019 08:14:17 +0000
+ + +<![CDATA[ Ep 05. 聊聊单元测试最佳实践和 PEP 572 ]]>http://pythonhunter.org/episodes/5http://pythonhunter.org/episodes/552:56nofull本期主持:

+ +

时间点:

+
  • 00:00:52 Mocking and Patching pitfalls
  • +
  • 00:30:41 PEP 572
  • +
  • 00:49:19 推荐环节
  • +
+

视频和链接

第一个视频 《Edwin Jung - Mocking and Patching Pitfalls - PyCon 2019》

+

第二个视频 《Dustin Ingram - PEP 572: The Walrus Operator - PyCon 2019》

+

推荐

+ ]]>
本期主持:

+ +

时间点:

+
  • 00:00:52 Mocking and Patching pitfalls
  • +
  • 00:30:41 PEP 572
  • +
  • 00:49:19 推荐环节
  • +
+

视频和链接

第一个视频 《Edwin Jung - Mocking and Patching Pitfalls - PyCon 2019》

+

第二个视频 《Dustin Ingram - PEP 572: The Walrus Operator - PyCon 2019》

+

推荐

+ ]]>
Fri, 09 Aug 2019 07:39:27 +0000
+ + +<![CDATA[ Ep 04. 主播带你逛 PyCon ]]>http://pythonhunter.org/episodes/4http://pythonhunter.org/episodes/41:01:04nofull本期主创 +

本期提要

  • 00:01:30 什么是 PSF
  • +
  • 00:06:50 PyCon 是什么
  • +
  • 00:10:00 主创们参加 PyCon 的经历
  • +
  • 00:18:20 laike9m 在 PyCon 2019 的经历
  • +
  • 00:31:34 Manjusakalaixintao 的组织经历
  • +
  • 00:44:32 如何参与到 PyCon 的组织中来
  • +
  • 00:59:18 下期预告
  • +
+

不是广告的广告

PyCon2019 中国Python开发者大会 - 上海站已经开始售票啦
+参与 PyCon China 2019,和主播们现场面基,还有机会获得捕蛇者说绝版 T恤!

+

相关链接

+ ]]>
本期主创 +

本期提要

  • 00:01:30 什么是 PSF
  • +
  • 00:06:50 PyCon 是什么
  • +
  • 00:10:00 主创们参加 PyCon 的经历
  • +
  • 00:18:20 laike9m 在 PyCon 2019 的经历
  • +
  • 00:31:34 Manjusakalaixintao 的组织经历
  • +
  • 00:44:32 如何参与到 PyCon 的组织中来
  • +
  • 00:59:18 下期预告
  • +
+

不是广告的广告

PyCon2019 中国Python开发者大会 - 上海站已经开始售票啦
+参与 PyCon China 2019,和主播们现场面基,还有机会获得捕蛇者说绝版 T恤!

+

相关链接

+ ]]>
Mon, 22 Jul 2019 11:10:57 +0000
+ +<![CDATA[ Ep 01. 新人到底需要什么 ]]>http://pythonhunter.org/episodes/1http://pythonhunter.org/episodes/11:32:01nofull本期主持:

+ +

播客中提到的内容:

+ + ]]>
本期主持:

+ +

播客中提到的内容:

+ + ]]>
Sun, 14 Apr 2019 06:50:29 +0000
\ No newline at end of file diff --git a/tests/feedlib/testdata/processor/test_audio.html b/tests/feedlib/testdata/processor/test_audio.html new file mode 100644 index 0000000000000000000000000000000000000000..dd350304fc9bd2d0615e08f74678cc4bd1778ce4 --- /dev/null +++ b/tests/feedlib/testdata/processor/test_audio.html @@ -0,0 +1,163 @@ +
+ +Ep 12. 和 F叔 聊一下服务化的那些事 - 捕蛇者说 + +
+ +
+
+
+ +

Ep 12. 和 F叔 聊一下服务化的那些事

+

本期我们和 F叔 聊了项目的服务化以及服务化过程中的各类问题和处理方式。

+
+
+ +
+
+

本期主播

+ +

本期嘉宾

+
+
+

时间线

+
    +
  • 00:00:33 开场
  • +
  • 00:00:46 自我介绍
  • +
  • 00:01:55 如何接触的 Python?
  • +
  • 00:04:29 日常对比 Python 与 JAVA
  • +
  • 00:11:00 什么是微服务(服务化)?
  • +
  • 00:14:20 推进微服务(服务化)时如何拆分业务?不同团队应如何配合?
  • +
  • 00:25:47 赖信涛同学“乱入”
  • +
  • 00:26:31 微服务(服务化)中的调用链路以及工具
  • +
  • 00:43:55 微服务(服务化)框架推荐
  • +
  • 00:48:08 RPC 框架需要做鉴权吗?
  • +
  • 00:52:25 怎么看待现在 Service Mesh 的方案?
  • +
  • 00:54:35 服务治理怎么做?需要关心那些事情?
  • +
  • 00:59:31 嘉宾推荐
  • +
+
+

相关链接

+
+
+ +
+
+
+ + + +
+

Listen This

+ +
+ +
+ +
+
+
+ +
+
+ + +
+
+
+
+ Typlog + +
+
+ + +
\ No newline at end of file diff --git a/tests/feedlib/testdata/processor/test_iframe.html b/tests/feedlib/testdata/processor/test_iframe.html new file mode 100644 index 0000000000000000000000000000000000000000..d9ba6e44090d7f9762556924681cd2b5ddb67097 --- /dev/null +++ b/tests/feedlib/testdata/processor/test_iframe.html @@ -0,0 +1 @@ +《怪物猎人 世界:冰原世纪》x《生化危机2 重制版》联动任务


\ No newline at end of file