[RSS] Provisional list unpacking

2020-10-01 11:41:57 -07:00
parent ac9405a2b5
commit 6fdb24a7c4
2 changed files with 36 additions and 4 deletions
--- a/rss/info.json
+++ b/rss/info.json
@@ -5,6 +5,6 @@
    "description": "Read RSS feeds",
    "tags": ["rss"],
    "permissions": ["embed_links"],
-    "requirements": ["bs4", "feedparser>=6.0.0", "scipy", "webcolors==1.3"],
+    "requirements": ["bs4>=4.9.1", "feedparser>=6.0.0", "scipy", "webcolors==1.3"],
    "min_bot_version" : "3.4.0"
 }
--- a/rss/rss.py
+++ b/rss/rss.py
@@ -1,6 +1,7 @@
 import asyncio
 import aiohttp
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
+import contextlib
 import copy
 import datetime
 import discord
@@ -25,7 +26,7 @@ from .tag_type import INTERNAL_TAGS, VALID_IMAGES, TagType
 log = logging.getLogger("red.aikaterna.rss")


-__version__ = "1.1.15"
+__version__ = "1.1.16"


 class RSS(commands.Cog):
@@ -121,7 +122,6 @@ class RSS(commands.Cog):
            tag_content_check = await self._get_tag_content_type(tag_content)

            if tag_content_check == TagType.HTML:
-
                # this is a tag that is only html content
                try:
                    soup = BeautifulSoup(tag_content, "html.parser")
@@ -144,6 +144,38 @@ class RSS(commands.Cog):

                rss_object[f"{tag_name}_plaintext"] = self._add_generic_html_plaintext(soup)

+            if tag_content_check == TagType.LIST:
+                for list_item in tag_content:
+                    list_item_check = await self._get_tag_content_type(list_item)
+
+                    # for common "links" format or when "content" is a list
+                    list_html_content_counter = 0
+                    if list_item_check == TagType.HTML:
+                        list_tags = ["value", "href"]
+                        for tag in list_tags:
+                            try:
+                                with contextlib.suppress(MarkupResemblesLocatorWarning):
+                                    soup = BeautifulSoup(list_item, "html.parser")
+                                list_html_content_counter += 1
+                                name = f"{tag_name}_plaintext{str(list_html_content_counter).zfill(2)}"
+                                rss_object[name] = self._add_generic_html_plaintext(soup)
+                                rss_object["is_special"].append(name)
+                            except (KeyError, TypeError):
+                                pass
+
+                    # common "author" tag format
+                    list_dict_content_counter = 0
+                    if list_item_check == TagType.DICT:
+                        list_tags = ["name"]
+                        for tag in list_tags:
+                            try:
+                                list_dict_content_counter += 1
+                                name = f"{tag_name}_plaintext{str(list_dict_content_counter).zfill(2)}"
+                                rss_object[name] = list_item[tag]
+                                rss_object["is_special"].append(name)
+                            except (KeyError, TypeError):
+                                pass
+
        # if media_thumbnail or media_content exists, return the first friendly url
        try:
            rss_object["media_content_plaintext"] = rss_object["media_content"][0]["url"]