Update ddxs domain name and bad text
Zokhoi committed Sep 5, 2024
1 parent 567076b commit 80e6d82
Showing 1 changed file, sources/zh/ddxsss.py, with 18 additions and 6 deletions.
@@ -13,13 +13,12 @@
class DdxSss(Crawler):
    base_url = [
        "https://www.ddxss.cc/",
    ]
    # custom banned text: it is loose text, and the cleaner would delete the whole chapter if it were used in bad_text_*
    banned_text = [
        "请收藏本站:https://www.ddxsss.com。顶点小说手机版:https://m.ddxsss.com",
        "https://www.ddtxt8.cc/",
    ]

    def initialize(self):
        self.init_executor(ratelimit=20)

        # the default lxml parser cannot handle the huge GBK-encoded sites (fails after 4.3k chapters)
        self.init_parser("html.parser")
        self.cleaner.bad_tags.update(["script", "a"])
@@ -28,6 +27,19 @@ def initialize(self):
            "div.Readpage.pagedown",
        ])

        # p tags only show up after the content has been parsed and formatted the first time;
        # these phrases are the site's self-promotion and navigation boilerplate
        self.cleaner.bad_tag_text_pairs["p"] = [
            "请收藏本站:",
            "顶点小说手机版:",
            "您可以在百度里搜索",
            "最新章节地址:",
            "全文阅读地址:",
            "txt下载地址:",
            "手机阅读:",
            '为了方便下次阅读,你可以点击下方的"收藏"记录本次',
            "请向你的朋友(QQ、博客、微信等方式)推荐本书,谢谢您的支持!!",
        ]

    def search_novel(self, query):
        data = self.get_json(
            f"{self.home_url}user/search.html?q={query}",
@@ -105,9 +117,9 @@ def download_chapter_body(self, chapter):
        soup = self.get_soup(chapter.url, encoding="utf-8")
        contents = soup.select_one("div#chaptercontent")
        text = self.cleaner.extract_contents(contents)
        for bad_text in self.banned_text:
            text = text.replace(bad_text, "")
        # the chapter title is usually present, but without a space between "chapter X" and the title
        text = text.replace(chapter.title, "")
        text = text.replace(chapter.title.replace(" ", ""), "")
        # remove paragraphs with bad text after parsing line breaks
        text = self.cleaner.extract_contents(self.make_soup(text))
        return text
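
A note on the two chapter.title replacements above: the site often prints the chapter number and title glued together, so the title is stripped both with and without its internal space. A tiny, self-contained illustration (the title and text below are made up, not taken from the site):

# Hypothetical illustration of the double replace above (made-up strings).
# chapter.title keeps a space ("第1章 开始" ~ "Chapter 1: Start"), but the page body
# usually renders it without one, so a plain replace of the title would miss it.
title = "第1章 开始"
text = "第1章开始正文从这里开始。"
text = text.replace(title, "")                   # no match: the page dropped the space
text = text.replace(title.replace(" ", ""), "")  # matches the space-less variant
print(text)  # -> "正文从这里开始。"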

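And a rough, standalone sketch of what the final make_soup + extract_contents pass is meant to achieve: once the chapter has been re-parsed into <p> paragraphs, any paragraph containing one of the registered bad phrases can be dropped wholesale. This uses plain BeautifulSoup rather than the project's TextCleaner, and the sample HTML is illustrative only:

# Approximate sketch of "remove paragraphs with bad text after parsing line breaks".
# Not the real TextCleaner; bad_p_texts is a small subset of the pairs registered above.
from bs4 import BeautifulSoup

bad_p_texts = ["请收藏本站:", "最新章节地址:"]

def drop_bad_paragraphs(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    for p in soup.find_all("p"):
        if any(bad in p.get_text() for bad in bad_p_texts):
            p.decompose()  # drop the whole promo paragraph, not just the phrase
    return str(soup)

sample = (
    "<p>正文第一段。</p>"
    "<p>请收藏本站:https://www.ddxss.cc/</p>"
    "<p>正文第二段。</p>"
)
print(drop_bad_paragraphs(sample))
# -> <p>正文第一段。</p><p>正文第二段。</p>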