From 1792e683f3da175fee2d9e18205847864bbc45af Mon Sep 17 00:00:00 2001
From: aikaterna <20862007+aikaterna@users.noreply.github.com>
Date: Sat, 12 Feb 2022 18:06:17 -0800
Subject: [PATCH] [RSS] Various fixes

* Handle feeds that don't have a link in the feed entry
* Do not check title matching if a feed entry has a non-matching link and a
  greater entry time than last post time
* Fix for $tags_list template key as it was setting it to the last tag found
  instead of the entire list, also check for duplicate tags during tag
  finding and do not add them
---
 rss/rss.py | 48 ++++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/rss/rss.py b/rss/rss.py
index 8f11499..f53cbc0 100644
--- a/rss/rss.py
+++ b/rss/rss.py
@@ -29,7 +29,7 @@
 IPV4_RE = re.compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}")
 IPV6_RE = re.compile("([a-f0-9:]+:+)+[a-f0-9]+")
 
-__version__ = "1.7.0"
+__version__ = "1.8.0"
 
 
 class RSS(commands.Cog):
@@ -131,6 +131,7 @@
         """Append bs4-discovered tags to an rss_feed/feedparser object."""
         rss_object["is_special"] = []
         soup = None
+        tags_list = []
 
         temp_rss_obect = copy.deepcopy(rss_object)
         for tag_name, tag_content in temp_rss_obect.items():
@@ -164,7 +165,6 @@
                     rss_object[f"{tag_name}_plaintext"] = self._add_generic_html_plaintext(soup)
 
             if tag_content_check == TagType.LIST:
-                tags_list = []
                 tags_content_counter = 0
 
                 for list_item in tag_content:
@@ -218,7 +218,7 @@
                                 name = f"media_plaintext{str(enclosure_content_counter).zfill(2)}"
                                 rss_object[name] = image_url
                                 rss_object["is_special"].append(name)
-                            tags_list.append(tag)
+                            tags_list.append(tag) if tag not in tags_list else tags_list
                         except KeyError:
                             pass
 
@@ -229,15 +229,15 @@
                             name = f"{tag_name}_plaintext{str(tags_content_counter).zfill(2)}"
                             rss_object[name] = tag
                             rss_object["is_special"].append(name)
-                            tags_list.append(tag)
+                            tags_list.append(tag) if tag not in tags_list else tags_list
                         except KeyError:
                             pass
 
-                if len(tags_list) > 0:
-                    rss_object["tags_list"] = tags_list
-                    rss_object["tags_plaintext_list"] = humanize_list(tags_list)
-                    rss_object["is_special"].append("tags_list")
-                    rss_object["is_special"].append("tags_plaintext_list")
+        if len(tags_list) > 0:
+            rss_object["tags_list"] = tags_list
+            rss_object["tags_plaintext_list"] = humanize_list(tags_list)
+            rss_object["is_special"].append("tags_list")
+            rss_object["is_special"].append("tags_plaintext_list")
 
         # if image dict tag exists, check for an image
         try:
@@ -458,16 +458,20 @@
         """
         entry_time = await self._time_tag_validation(feedparser_plus_obj)
 
-        # sometimes there's no title attribute and feedparser doesn't really play nice with that
+        # sometimes there's no title or no link attribute and feedparser doesn't really play nice with that
         try:
            feedparser_plus_obj_title = feedparser_plus_obj["title"]
         except KeyError:
             feedparser_plus_obj_title = ""
+        try:
+            feedparser_plus_obj_link = feedparser_plus_obj["link"]
+        except KeyError:
+            feedparser_plus_obj_link = ""
 
         rss_object = RssFeed(
             name=feed_name.lower(),
             last_title=feedparser_plus_obj_title,
-            last_link=feedparser_plus_obj["link"],
+            last_link=feedparser_plus_obj_link,
             last_time=entry_time,
             template="$title\n$link",
             url=url,
@@ -1343,15 +1347,23 @@
                 title = sorted_feed_by_post_time[0].title
             except AttributeError:
                 title = ""
-            await self._update_last_scraped(channel, name, title, sorted_feed_by_post_time[0].link, entry_time)
+            try:
+                link = sorted_feed_by_post_time[0].link
+            except AttributeError:
+                link = ""
+            await self._update_last_scraped(channel, name, title, link, entry_time)
 
         feedparser_plus_objects = []
         for entry in sorted_feed_by_post_time:
-            # sometimes there's no title attribute and feedparser doesn't really play nice with that
+            # sometimes there's no title or no link attribute and feedparser doesn't really play nice with that
             try:
                 entry_title = entry.title
             except AttributeError:
                 entry_title = ""
+            try:
+                entry_link = entry.link
+            except AttributeError:
+                entry_link = ""
 
             # find the updated_parsed (checked first) or an published_parsed tag if they are present
             entry_time = await self._time_tag_validation(entry)
@@ -1370,16 +1382,16 @@
             # this will only work for rss sites that are single-use like cloudflare status, discord status, etc
             # where an update on the last post should be posted
             # this can be overridden by a bot owner in the rss parse command, per problematic website
-            if (last_title == entry_title) and (last_link == entry.link) and (entry_time > last_time):
+            if (last_title == entry_title) and (last_link == entry_link) and (last_time < entry_time):
                 log.debug(f"New update found for an existing post in {name} on cid {channel.id}")
                 feedparser_plus_obj = await self._add_to_feedparser_object(entry, url)
                 feedparser_plus_objects.append(feedparser_plus_obj)
             # regular feed qualification after this
-            if (last_title != entry_title) and (last_link != entry.link) and (last_time < entry_time):
-                log.debug(f"New entry found via time validation for feed {name} on cid {channel.id}")
+            if (last_link != entry_link) and (last_time < entry_time):
+                log.debug(f"New entry found via time and link validation for feed {name} on cid {channel.id}")
                 feedparser_plus_obj = await self._add_to_feedparser_object(entry, url)
                 feedparser_plus_objects.append(feedparser_plus_obj)
-            if (last_title == "" and entry_title == "") and (last_link != entry.link) and (last_time < entry_time):
+            if (last_title == "" and entry_title == "") and (last_link != entry_link) and (last_time < entry_time):
                 log.debug(f"New entry found via time validation for feed {name} on cid {channel.id} - no title")
                 feedparser_plus_obj = await self._add_to_feedparser_object(entry, url)
                 feedparser_plus_objects.append(feedparser_plus_obj)
@@ -1387,7 +1399,7 @@
             # this is a post that has no time information attached to it and we can only
             # verify that the title and link did not match the previously posted entry
             elif (entry_time or last_time) is None:
-                if last_title == entry_title and last_link == entry.link:
+                if last_title == entry_title and last_link == entry_link:
                     log.debug(f"Breaking rss entry loop for {name} on {channel.id}, via link match")
                     break
                 else: