diff --git a/rss/rss.py b/rss/rss.py
index c113221..bb3f0cb 100644
--- a/rss/rss.py
+++ b/rss/rss.py
@@ -25,7 +25,8 @@ from .tag_type import INTERNAL_TAGS, VALID_IMAGES, TagType
+from types import SimpleNamespace
 
 log = logging.getLogger("red.aikaterna.rss")
 
-__version__ = "1.3.8"
+__version__ = "1.4.0"
 
 
 class RSS(commands.Cog):
@@ -42,7 +42,7 @@ class RSS(commands.Cog):
 
         self._read_feeds_loop = None
 
-        self._headers = {'User-Agent': 'Python/3.8'}
+        self._headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0'}
 
     def initialize(self):
         self._read_feeds_loop = self.bot.loop.create_task(self.read_feeds())
@@ -73,11 +73,16 @@ class RSS(commands.Cog):
         if not rss_exists:
             feedparser_obj = await self._fetch_feedparser_object(url)
             if not feedparser_obj:
-                await ctx.send("Couldn't fetch that feed for some reason.")
+                await ctx.send("Couldn't fetch that feed: there were no feed objects found.")
                 return
 
             # sort everything by time if a time value is present
-            sorted_feed_by_post_time = await self._sort_by_post_time(feedparser_obj)
+            if feedparser_obj.entries:
+                # this feed has posts
+                sorted_feed_by_post_time = await self._sort_by_post_time(feedparser_obj.entries)
+            else:
+                # this feed does not have posts, but it has a header with channel information
+                sorted_feed_by_post_time = [feedparser_obj.feed]
 
             # add additional tags/images/clean html
             feedparser_plus_obj = await self._add_to_feedparser_object(sorted_feed_by_post_time[0], url)
@@ -219,12 +224,11 @@ class RSS(commands.Cog):
             except KeyError:
                 pass
 
-        # change published_parsed or updated_parsed into a datetime object for embed footers
-        for time_tag in ["published_parsed", "updated_parsed"]:
+        # change updated_parsed and published_parsed into datetime objects for embed footers
+        for time_tag in ["updated_parsed", "published_parsed"]:
             try:
                 if isinstance(rss_object[time_tag], time.struct_time):
                     rss_object[f"{time_tag}_datetime"] = datetime.datetime(*rss_object[time_tag][:6])
-                break
             except KeyError:
                 pass
 
@@ -286,6 +290,11 @@ class RSS(commands.Cog):
-    def _get_channel_object(self, channel_id: int):
+    async def _get_channel_object(self, channel_id: int):
         """Helper for rss feed loop."""
         channel = self.bot.get_channel(channel_id)
+        if not channel:
+            try:
+                channel = await self.bot.fetch_channel(channel_id)
+            except (discord.errors.Forbidden, discord.errors.NotFound):
+                return None
         if channel and channel.permissions_for(channel.guild.me).send_messages:
             return channel
         return None
@@ -330,30 +339,44 @@ class RSS(commands.Cog):
             async with aiohttp.ClientSession(headers=self._headers, timeout=timeout) as session:
                 async with session.get(url) as resp:
                     html = await resp.read()
-                    return html
+                    return html, None
         except aiohttp.client_exceptions.ClientConnectorError:
-            log.error(f"aiohttp failure accessing feed at url:\n\t{url}", exc_info=True)
-            return None
+            friendly_msg = "There was an OSError or the connection failed."
+            msg = f"aiohttp failure accessing feed at url:\n\t{url}"
+            log.error(msg, exc_info=True)
+            return None, friendly_msg
+        except aiohttp.client_exceptions.ClientPayloadError as e:
+            friendly_msg = "The website closed the connection prematurely or the response was malformed.\n"
+            friendly_msg += f"The error returned was: `{str(e)}`\n"
+            friendly_msg += "For more technical information, check your bot's console or logs."
+ msg = f"content error while reading feed at url:\n\t{url}" + log.error(msg, exc_info=True) + return None, friendly_msg except asyncio.exceptions.TimeoutError: - log.error(f"asyncio timeout while accessing feed at url:\n\t{url}") - return None + friendly_msg = "The bot timed out while trying to access that content." + msg = f"asyncio timeout while accessing feed at url:\n\t{url}" + log.error(msg, exc_info=True) + return None, friendly_msg except Exception: - log.error(f"General failure accessing feed at url:\n\t{url}", exc_info=True) - return None + friendly_msg = "There was an unexpected error. Check your console for more information." + msg = f"General failure accessing feed at url:\n\t{url}" + log.error(msg, exc_info=True) + return None, friendly_msg async def _fetch_feedparser_object(self, url: str): - """Get all feedparser entries from a url.""" - html = await self._get_url_content(url) + """Get a full feedparser object from a url: channel header + items.""" + html, error_msg = await self._get_url_content(url) if not html: - return None + return SimpleNamespace(entries=None, error=error_msg, url=url) feedparser_obj = feedparser.parse(html) - if feedparser_obj.bozo: - log.debug(f"Feed at {url} is bad or took too long to respond.") - return None + error_msg = f"Bozo feed: feedparser is unable to parse the response from {url}.\n" + error_msg += f"Feedparser error message: `{feedparser_obj.bozo_exception}`" + return SimpleNamespace(entries=None, error=error_msg, url=url) - return feedparser_obj.entries + return feedparser_obj async def _add_to_feedparser_object(self, feedparser_obj: feedparser.util.FeedParserDict, url: str): """ @@ -390,9 +412,10 @@ class RSS(commands.Cog): return rss_object async def _sort_by_post_time(self, feedparser_obj: feedparser.util.FeedParserDict): - for time_tag in ["published_parsed", "updated_parsed"]: + for time_tag in ["updated_parsed", "published_parsed"]: try: - sorted_feed_by_post_time = sorted(feedparser_obj, key=lambda x: x.get(time_tag), reverse=True) + baseline_time = time.struct_time((2021, 1, 1, 12, 0, 0, 4, 1, -1)) + sorted_feed_by_post_time = sorted(feedparser_obj, key=lambda x: x.get(time_tag, baseline_time), reverse=True) break except TypeError: sorted_feed_by_post_time = feedparser_obj @@ -401,9 +424,9 @@ class RSS(commands.Cog): async def _time_tag_validation(self, entry: feedparser.util.FeedParserDict): """Gets a unix timestamp if it's available from a single feedparser post entry.""" - entry_time = entry.get("published_parsed", None) + entry_time = entry.get("updated_parsed", None) if not entry_time: - entry_time = entry.get("updated_parsed", None) + entry_time = entry.get("published_parsed", None) if isinstance(entry_time, time.struct_time): entry_time = time.mktime(entry_time) if entry_time: @@ -446,14 +469,17 @@ class RSS(commands.Cog): if all([result.scheme, result.netloc, result.path]): if feed_check: - text = await self._get_url_content(url) + text, error_msg = await self._get_url_content(url) if not text: - log.debug(f"no text from _get_url_content: {url}") + raise NoFeedContent(error_msg) return False rss = feedparser.parse(text) if rss.bozo: - log.debug(f"bozo feed at {url}") + msg = f"Bozo feed: feedparser is unable to parse the response from {url}.\n\n" + msg += "Received content preview:\n" + msg += box(rss.feed.get("summary", str(rss))[:1500]) + raise NoFeedContent(msg) return False else: return True @@ -507,7 +533,12 @@ class RSS(commands.Cog): return async with ctx.typing(): - valid_url = await self._valid_url(url) + try: + 
+                valid_url = await self._valid_url(url)
+            except NoFeedContent as e:
+                await ctx.send(str(e))
+                return
+
             if valid_url:
                 await self._add_feed(ctx, feed_name.lower(), channel, url)
             else:
@@ -706,9 +737,9 @@ class RSS(commands.Cog):
             url_scheme = url_parse.scheme
         feed_url_types = ["application/rss+xml", "application/atom+xml", "text/xml", "application/rdf+xml"]
         for feed_type in feed_url_types:
-            possible_feeds = soup.find_all('link', rel='alternate', type=feed_type, href=True)
+            possible_feeds = soup.find_all("link", rel="alternate", type=feed_type, href=True)
             for feed in possible_feeds:
-                feed_url = feed.get('href', None)
+                feed_url = feed.get("href", None)
                 ls_feed_url = feed_url.lstrip("/")
                 if not feed_url:
                     continue
@@ -827,10 +858,16 @@ class RSS(commands.Cog):
         """Helper function for rss listtags."""
         msg = f"[ Available Tags for {feed_name} ]\n\n\t"
         feedparser_obj = await self._fetch_feedparser_object(rss_feed["url"])
+
         if not feedparser_obj:
-            await ctx.send("Couldn't fetch that feed for some reason.")
+            await ctx.send("Couldn't fetch that feed.")
             return
-        feedparser_plus_obj = await self._add_to_feedparser_object(feedparser_obj[0], rss_feed["url"])
+        if feedparser_obj.entries:
+            # this feed has posts
+            feedparser_plus_obj = await self._add_to_feedparser_object(feedparser_obj.entries[0], rss_feed["url"])
+        else:
+            # this feed does not have posts, but it has a header with channel information
+            feedparser_plus_obj = await self._add_to_feedparser_object(feedparser_obj.feed, rss_feed["url"])
 
         for tag_name, tag_content in sorted(feedparser_plus_obj.items()):
             if tag_name in INTERNAL_TAGS:
@@ -1041,11 +1078,21 @@ class RSS(commands.Cog):
         feedparser_obj = await self._fetch_feedparser_object(url)
         if not feedparser_obj:
             return
+        try:
+            log.debug(f"{feedparser_obj.error} Channel: {channel.id}")
+            return
+        except AttributeError:
+            pass
 
-        # sorting the entire feedparser object by published_parsed time if it exists, if not then updated_parsed
+        # sorting the entire feedparser object by updated_parsed time if it exists, if not then published_parsed
         # certain feeds can be rearranged by a user, causing all posts to be out of sequential post order
         # or some feeds are out of time order by default
-        sorted_feed_by_post_time = await self._sort_by_post_time(feedparser_obj)
+        if feedparser_obj.entries:
+            # this feed has posts
+            sorted_feed_by_post_time = await self._sort_by_post_time(feedparser_obj.entries)
+        else:
+            # this feed does not have posts, but it has a header with channel information
+            sorted_feed_by_post_time = [feedparser_obj.feed]
 
         if not force:
             entry_time = await self._time_tag_validation(sorted_feed_by_post_time[0])
@@ -1058,7 +1105,7 @@ class RSS(commands.Cog):
         feedparser_plus_objects = []
         for entry in sorted_feed_by_post_time:
 
-            # find the published_parsed (checked first) or an updatated_parsed tag if they are present
+            # find the updated_parsed (checked first) or a published_parsed tag if they are present
            entry_time = await self._time_tag_validation(entry)
 
             # we only need one feed entry if this is from rss force
@@ -1067,7 +1114,18 @@ class RSS(commands.Cog):
                 feedparser_plus_objects.append(feedparser_plus_obj)
                 break
 
-            # if this feed has a published_parsed or an updatated_parsed tag, it will use
+            # now that we are sorting by/saving updated_parsed instead of published_parsed (rss 1.4.0+),
+            # we can post an update for a post that already exists and has already been posted.
+            # this will only work for rss sites that are single-use, like cloudflare status, discord status, etc.,
+            # where an update on the last post should be posted.
+            # to check every post in every feed for updated posts, each entry in an rss feed would need
+            # to be saved instead of just the last one.
+            elif (last_title == entry.title) and (last_link == entry.link) and (entry_time > last_time):
+                log.debug(f"New update found for an existing post in {name} on cid {channel.id}")
+                feedparser_plus_obj = await self._add_to_feedparser_object(entry, url)
+                feedparser_plus_objects.append(feedparser_plus_obj)
+
+            # if this feed has a published_parsed or an updated_parsed tag, it will use
             # that time value present in entry_time to verify that the post is new.
             elif (entry_time and last_time) is not None:
                 if (last_title != entry.title) and (last_link != entry.link) and (last_time < entry_time):
@@ -1223,7 +1281,7 @@ class RSS(commands.Cog):
                 embed_list.append(embed)
 
         # Add published timestamp to the last footer if it exists
-        time_tags = ["published_parsed_datetime", "updated_parsed_datetime"]
+        time_tags = ["updated_parsed_datetime", "published_parsed_datetime"]
         for time_tag in time_tags:
             try:
                 published_time = feedparser_plus_obj[time_tag]
@@ -1324,7 +1382,7 @@ class RSS(commands.Cog):
-            channel = self._get_channel_object(channel_id)
+            channel = await self._get_channel_object(channel_id)
             if not channel:
                 log.info(
-                    f"Response channel {channel_id} not found or no perms to send messages, removing channel from config"
+                    f"Response channel {channel_id} not found, forbidden to access, or no perms to send messages, removing channel from config"
                 )
                 await self.config.channel_from_id(int(channel_id)).clear()  # Remove entries from dead channel
                 continue
@@ -1351,3 +1409,11 @@ class RSS(commands.Cog):
         except asyncio.queues.QueueEmpty:
             return None
         return to_check
+
+
+class NoFeedContent(Exception):
+    def __init__(self, m):
+        self.message = m
+
+    def __str__(self):
+        return self.message
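For reviewers, a minimal self-contained sketch of the error-flow contract this patch introduces, assuming nothing beyond the diff above: helpers in the style of _get_url_content return an (html, friendly_msg) tuple, helpers in the style of _fetch_feedparser_object always return an object, and callers detect the error shape by probing for an .error attribute, just as the feed loop does. The names fetch_xml, fetch_feed, SAMPLE_XML, and main are hypothetical stand-ins and are not part of the cog:

# hypothetical sketch; fetch_xml stands in for RSS._get_url_content and
# fetch_feed for RSS._fetch_feedparser_object
import asyncio
from types import SimpleNamespace

import feedparser

SAMPLE_XML = b"""<?xml version="1.0"?><rss version="2.0"><channel>
<title>demo</title><item><title>post</title></item></channel></rss>"""


async def fetch_xml(url: str):
    # returns (html, friendly_msg): content on success, a user-facing message on failure
    if url.startswith("https://"):
        return SAMPLE_XML, None
    return None, "There was an unexpected error."


async def fetch_feed(url: str):
    html, error_msg = await fetch_xml(url)
    if not html:
        # error shape: entries=None plus a user-facing .error message
        return SimpleNamespace(entries=None, error=error_msg, url=url)
    parsed = feedparser.parse(html)
    if parsed.bozo:
        return SimpleNamespace(entries=None, error=f"Bozo feed: {parsed.bozo_exception}", url=url)
    return parsed  # a FeedParserDict with .entries (posts) and .feed (channel header)


async def main():
    feed = await fetch_feed("https://example.com/rss")
    try:
        print(feed.error)  # only the SimpleNamespace error shape carries .error
        return
    except AttributeError:
        pass  # FeedParserDict raises AttributeError for missing keys
    # use the posts if present, otherwise fall back to the channel header
    items = feed.entries if feed.entries else [feed.feed]
    print(f"{len(items)} item(s) from {feed.feed.title}")


asyncio.run(main())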