[RSS] User facing improvments

* More descriptive error messages for users
* Now will post updates of last post sent
* Handles feeds that do not have entries
* Better channel verification for removing dead feeds
This commit is contained in:
aikaterna
2021-03-03 09:20:40 -08:00
committed by GitHub
parent c3f5dc0feb
commit 2b3ab743ee

View File

@@ -25,7 +25,7 @@ from .tag_type import INTERNAL_TAGS, VALID_IMAGES, TagType
log = logging.getLogger("red.aikaterna.rss")
__version__ = "1.3.8"
__version__ = "1.4.0"
class RSS(commands.Cog):
@@ -42,7 +42,7 @@ class RSS(commands.Cog):
self._read_feeds_loop = None
self._headers = {'User-Agent': 'Python/3.8'}
self._headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0'}
def initialize(self):
self._read_feeds_loop = self.bot.loop.create_task(self.read_feeds())
@@ -73,11 +73,16 @@ class RSS(commands.Cog):
if not rss_exists:
feedparser_obj = await self._fetch_feedparser_object(url)
if not feedparser_obj:
await ctx.send("Couldn't fetch that feed for some reason.")
await ctx.send("Couldn't fetch that feed: there were no feed objects found.")
return
# sort everything by time if a time value is present
sorted_feed_by_post_time = await self._sort_by_post_time(feedparser_obj)
if feedparser_obj.entries:
# this feed has posts
sorted_feed_by_post_time = await self._sort_by_post_time(feedparser_obj.entries)
else:
# this feed does not have posts, but it has a header with channel information
sorted_feed_by_post_time = [feedparser_obj.feed]
# add additional tags/images/clean html
feedparser_plus_obj = await self._add_to_feedparser_object(sorted_feed_by_post_time[0], url)
@@ -219,12 +224,11 @@ class RSS(commands.Cog):
except KeyError:
pass
# change published_parsed or updated_parsed into a datetime object for embed footers
for time_tag in ["published_parsed", "updated_parsed"]:
# change published_parsed and updated_parsed into a datetime object for embed footers
for time_tag in ["updated_parsed", "published_parsed"]:
try:
if isinstance(rss_object[time_tag], time.struct_time):
rss_object[f"{time_tag}_datetime"] = datetime.datetime(*rss_object[time_tag][:6])
break
except KeyError:
pass
@@ -286,6 +290,11 @@ class RSS(commands.Cog):
def _get_channel_object(self, channel_id: int):
"""Helper for rss feed loop."""
channel = self.bot.get_channel(channel_id)
if not channel:
try:
channel = self.bot.fetch_channel(channel_id)
except (discord.errors.Forbidden, discord.errors.NotFound):
return None
if channel and channel.permissions_for(channel.guild.me).send_messages:
return channel
return None
@@ -330,30 +339,43 @@ class RSS(commands.Cog):
async with aiohttp.ClientSession(headers=self._headers, timeout=timeout) as session:
async with session.get(url) as resp:
html = await resp.read()
return html
return html, None
except aiohttp.client_exceptions.ClientConnectorError:
log.error(f"aiohttp failure accessing feed at url:\n\t{url}", exc_info=True)
return None
friendly_msg = "There was an OSError or the connection failed."
msg = f"aiohttp failure accessing feed at url:\n\t{url}"
log.error(msg, exc_info=True)
return None, friendly_msg
except aiohttp.client_exceptions.ClientPayloadError as e:
friendly_msg = "The website closed the connection prematurely or the response was malformed.\n"
friendly_msg += f"The error returned was: `{str(e)}`\n"
friendly_msg += "For more technical information, check your bot's console or logs."
msg = f"content error while reading feed at url:\n\t{url}"
log.error(msg, exc_info=True)
return None, friendly_msg
except asyncio.exceptions.TimeoutError:
log.error(f"asyncio timeout while accessing feed at url:\n\t{url}")
return None
friendly_msg = "The bot timed out while trying to access that content."
msg = f"asyncio timeout while accessing feed at url:\n\t{url}"
log.error(msg, exc_info=True)
return None, friendly_msg
except Exception:
log.error(f"General failure accessing feed at url:\n\t{url}", exc_info=True)
return None
friendly_msg = "There was an unexpected error. Check your console for more information."
msg = f"General failure accessing feed at url:\n\t{url}"
log.error(msg, exc_info=True)
return None, friendly_msg
async def _fetch_feedparser_object(self, url: str):
"""Get all feedparser entries from a url."""
html = await self._get_url_content(url)
"""Get a full feedparser object from a url: channel header + items."""
html, error_msg = await self._get_url_content(url)
if not html:
return None
return SimpleNamespace(entries=None, error=error_msg, url=url)
feedparser_obj = feedparser.parse(html)
if feedparser_obj.bozo:
log.debug(f"Feed at {url} is bad or took too long to respond.")
return None
error_msg = f"Bozo feed: feedparser is unable to parse the response from {url}.\n"
error_msg += f"Feedparser error message: `{feedparser_obj.bozo_exception}`"
return SimpleNamespace(entries=None, error=error_msg, url=url)
return feedparser_obj.entries
return feedparser_obj
async def _add_to_feedparser_object(self, feedparser_obj: feedparser.util.FeedParserDict, url: str):
"""
@@ -390,9 +412,10 @@ class RSS(commands.Cog):
return rss_object
async def _sort_by_post_time(self, feedparser_obj: feedparser.util.FeedParserDict):
for time_tag in ["published_parsed", "updated_parsed"]:
for time_tag in ["updated_parsed", "published_parsed"]:
try:
sorted_feed_by_post_time = sorted(feedparser_obj, key=lambda x: x.get(time_tag), reverse=True)
baseline_time = time.struct_time((2021, 1, 1, 12, 0, 0, 4, 1, -1))
sorted_feed_by_post_time = sorted(feedparser_obj, key=lambda x: x.get(time_tag, baseline_time), reverse=True)
break
except TypeError:
sorted_feed_by_post_time = feedparser_obj
@@ -401,9 +424,9 @@ class RSS(commands.Cog):
async def _time_tag_validation(self, entry: feedparser.util.FeedParserDict):
"""Gets a unix timestamp if it's available from a single feedparser post entry."""
entry_time = entry.get("published_parsed", None)
entry_time = entry.get("updated_parsed", None)
if not entry_time:
entry_time = entry.get("updated_parsed", None)
entry_time = entry.get("published_parsed", None)
if isinstance(entry_time, time.struct_time):
entry_time = time.mktime(entry_time)
if entry_time:
@@ -446,14 +469,17 @@ class RSS(commands.Cog):
if all([result.scheme, result.netloc, result.path]):
if feed_check:
text = await self._get_url_content(url)
text, error_msg = await self._get_url_content(url)
if not text:
log.debug(f"no text from _get_url_content: {url}")
raise NoFeedContent(error_msg)
return False
rss = feedparser.parse(text)
if rss.bozo:
log.debug(f"bozo feed at {url}")
msg = f"Bozo feed: feedparser is unable to parse the response from {url}.\n\n"
msg += "Received content preview:\n"
msg += box(rss.feed.get("summary", str(rss))[:1500])
raise NoFeedContent(msg)
return False
else:
return True
@@ -507,7 +533,12 @@ class RSS(commands.Cog):
return
async with ctx.typing():
valid_url = await self._valid_url(url)
try:
valid_url = await self._valid_url(url)
except NoFeedContent as e:
await ctx.send(str(e))
return
if valid_url:
await self._add_feed(ctx, feed_name.lower(), channel, url)
else:
@@ -706,9 +737,9 @@ class RSS(commands.Cog):
url_scheme = url_parse.scheme
feed_url_types = ["application/rss+xml", "application/atom+xml", "text/xml", "application/rdf+xml"]
for feed_type in feed_url_types:
possible_feeds = soup.find_all('link', rel='alternate', type=feed_type, href=True)
possible_feeds = soup.find_all("link", rel="alternate", type=feed_type, href=True)
for feed in possible_feeds:
feed_url = feed.get('href', None)
feed_url = feed.get("href", None)
ls_feed_url = feed_url.lstrip("/")
if not feed_url:
continue
@@ -827,10 +858,16 @@ class RSS(commands.Cog):
"""Helper function for rss listtags."""
msg = f"[ Available Tags for {feed_name} ]\n\n\t"
feedparser_obj = await self._fetch_feedparser_object(rss_feed["url"])
if not feedparser_obj:
await ctx.send("Couldn't fetch that feed for some reason.")
await ctx.send("Couldn't fetch that feed.")
return
feedparser_plus_obj = await self._add_to_feedparser_object(feedparser_obj[0], rss_feed["url"])
if feedparser_obj.entries:
# this feed has posts
feedparser_plus_obj = await self._add_to_feedparser_object(feedparser_obj.entries[0], rss_feed["url"])
else:
# this feed does not have posts, but it has a header with channel information
feedparser_plus_obj = await self._add_to_feedparser_object(feedparser_obj.feed, rss_feed["url"])
for tag_name, tag_content in sorted(feedparser_plus_obj.items()):
if tag_name in INTERNAL_TAGS:
@@ -1041,11 +1078,21 @@ class RSS(commands.Cog):
feedparser_obj = await self._fetch_feedparser_object(url)
if not feedparser_obj:
return
try:
log.debug(f"{feedparser_obj.error} Channel: {channel.id}")
return
except AttributeError:
pass
# sorting the entire feedparser object by published_parsed time if it exists, if not then updated_parsed
# sorting the entire feedparser object by updated_parsed time if it exists, if not then published_parsed
# certain feeds can be rearranged by a user, causing all posts to be out of sequential post order
# or some feeds are out of time order by default
sorted_feed_by_post_time = await self._sort_by_post_time(feedparser_obj)
if feedparser_obj.entries:
# this feed has posts
sorted_feed_by_post_time = await self._sort_by_post_time(feedparser_obj.entries)
else:
# this feed does not have posts, but it has a header with channel information
sorted_feed_by_post_time = [feedparser_obj.feed]
if not force:
entry_time = await self._time_tag_validation(sorted_feed_by_post_time[0])
@@ -1058,7 +1105,7 @@ class RSS(commands.Cog):
feedparser_plus_objects = []
for entry in sorted_feed_by_post_time:
# find the published_parsed (checked first) or an updatated_parsed tag if they are present
# find the updated_parsed (checked first) or an published_parsed tag if they are present
entry_time = await self._time_tag_validation(entry)
# we only need one feed entry if this is from rss force
@@ -1067,7 +1114,18 @@ class RSS(commands.Cog):
feedparser_plus_objects.append(feedparser_plus_obj)
break
# if this feed has a published_parsed or an updatated_parsed tag, it will use
# now that we are sorting by/saving updated_parsed instead of published_parsed (rss 1.4.0+)
# we can post an update for a post that already exists and has already been posted.
# this will only work for rss sites that are single-use like cloudflare status, discord status, etc
# where an update on the last post should be posted
# for checking every post in every feed for updated posts, each entry in an rss feed would need
# to be saved instead of just the last one.
elif (last_title == entry.title) and (last_link == entry.link) and (entry_time > last_time):
log.debug(f"New update found for an existing post in {name} on cid {channel.id}")
feedparser_plus_obj = await self._add_to_feedparser_object(entry, url)
feedparser_plus_objects.append(feedparser_plus_obj)
# if this feed has a published_parsed or an updated_parsed tag, it will use
# that time value present in entry_time to verify that the post is new.
elif (entry_time and last_time) is not None:
if (last_title != entry.title) and (last_link != entry.link) and (last_time < entry_time):
@@ -1223,7 +1281,7 @@ class RSS(commands.Cog):
embed_list.append(embed)
# Add published timestamp to the last footer if it exists
time_tags = ["published_parsed_datetime", "updated_parsed_datetime"]
time_tags = ["updated_parsed_datetime", "published_parsed_datetime"]
for time_tag in time_tags:
try:
published_time = feedparser_plus_obj[time_tag]
@@ -1324,7 +1382,7 @@ class RSS(commands.Cog):
channel = self._get_channel_object(channel_id)
if not channel:
log.info(
f"Response channel {channel_id} not found or no perms to send messages, removing channel from config"
f"Response channel {channel_id} not found, forbidden to access, or no perms to send messages, removing channel from config"
)
await self.config.channel_from_id(int(channel_id)).clear() # Remove entries from dead channel
continue
@@ -1351,3 +1409,11 @@ class RSS(commands.Cog):
except asyncio.queues.QueueEmpty:
return None
return to_check
class NoFeedContent(Exception):
def __init__(self, m):
self.message = m
def __str__(self):
return self.message