From 0a7497fba6586f8b944317bf325cf7fb1a73b5fc Mon Sep 17 00:00:00 2001
From: Stuart Longland
Date: Sun, 15 Oct 2017 08:32:18 +1000
Subject: [PATCH] Handle entries that have no HTML content.

Some sites, notably ABC News, provide just plain-text content or a
practically identical HTML summary, resulting in neither appearing on
the feed.  This checks the output of the HTML content extraction and
falls back to the summary if that comes up empty.
---
 tornadonews/tornadonews.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tornadonews/tornadonews.py b/tornadonews/tornadonews.py
index 6fd815d..f7ea612 100644
--- a/tornadonews/tornadonews.py
+++ b/tornadonews/tornadonews.py
@@ -69,10 +69,15 @@ class FeedEntry(object):
         Parse the feedparser-generated entry dict and return a FeedEntry object
         from it.
         """
-        if 'content' in entry:
+        content = None
+
+        if entry.get('content'):
             html_content = filter(lambda c : 'html' in c['type'], entry['content'])
             content = ''.join([c['value'] for c in html_content])
-        else:
+
+        # If the content is empty or not present, then use summary.
+        # ABC news gives plain text (not HTML) content.
+        if not content:
             content = entry['summary']
 
         try: