Update ddxs domain name and bad text
Zokhoi committed Sep 5, 2024
1 parent 567076b commit 80e6d82
Showing 1 changed file, sources/zh/ddxsss.py, with 18 additions and 6 deletions.
@@ -13,13 +13,12 @@
class DdxSss(Crawler):
    base_url = [
        "https://www.ddxss.cc/",
    ]
    # custom banned text: it is loose text, and the cleaner would delete the whole chapter if it were used in bad_text_*
    banned_text = [
        "请收藏本站:https://www.ddxsss.com。顶点小说手机版:https://m.ddxsss.com",
        "https://www.ddtxt8.cc/",
    ]

    def initialize(self):
        self.init_executor(ratelimit=20)

        # the default lxml parser cannot handle the huge GBK-encoded sites (fails after 4.3k chapters)
        self.init_parser("html.parser")
        self.cleaner.bad_tags.update(["script", "a"])
@@ -28,6 +27,19 @@ def initialize(self):
            "div.Readpage.pagedown",
        ])

        # p tags only show up after the content has been parsed and formatted the first time;
        # these phrases are the site's self-promotion and navigation boilerplate
        self.cleaner.bad_tag_text_pairs["p"] = [
            "请收藏本站:",
            "顶点小说手机版:",
            "您可以在百度里搜索",
            "最新章节地址:",
            "全文阅读地址:",
            "txt下载地址:",
            "手机阅读:",
            '为了方便下次阅读,你可以点击下方的"收藏"记录本次',
            "请向你的朋友(QQ、博客、微信等方式)推荐本书,谢谢您的支持!!",
        ]

    def search_novel(self, query):
        data = self.get_json(
            f"{self.home_url}user/search.html?q={query}",
@@ -105,9 +117,9 @@ def download_chapter_body(self, chapter):
        soup = self.get_soup(chapter.url, encoding="utf-8")
        contents = soup.select_one("div#chaptercontent")
        text = self.cleaner.extract_contents(contents)
        for bad_text in self.banned_text:
            text = text.replace(bad_text, "")
        # the chapter title is usually present, but without a space between "chapter X" and the title
        text = text.replace(chapter.title, "")
        text = text.replace(chapter.title.replace(" ", ""), "")
        # remove paragraphs with bad text after parsing line breaks
        text = self.cleaner.extract_contents(self.make_soup(text))
        return text
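
A note on the two chapter.title replacements above: the site often prints the chapter number and title glued together, so the title is stripped both with and without its internal space. A tiny, self-contained illustration (the title and text below are made up, not taken from the site):

# Hypothetical illustration of the double replace above (made-up strings).
# chapter.title keeps a space ("第1章 开始" ~ "Chapter 1: Start"), but the page body
# usually renders it without one, so a plain replace of the title would miss it.
title = "第1章 开始"
text = "第1章开始正文从这里开始。"
text = text.replace(title, "")                   # no match: the page dropped the space
text = text.replace(title.replace(" ", ""), "")  # matches the space-less variant
print(text)  # -> "正文从这里开始。"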

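And a rough, standalone sketch of what the final make_soup + extract_contents pass is meant to achieve: once the chapter has been re-parsed into <p> paragraphs, any paragraph containing one of the registered bad phrases can be dropped wholesale. This uses plain BeautifulSoup rather than the project's TextCleaner, and the sample HTML is illustrative only:

# Approximate sketch of "remove paragraphs with bad text after parsing line breaks".
# Not the real TextCleaner; bad_p_texts is a small subset of the pairs registered above.
from bs4 import BeautifulSoup

bad_p_texts = ["请收藏本站:", "最新章节地址:"]

def drop_bad_paragraphs(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    for p in soup.find_all("p"):
        if any(bad in p.get_text() for bad in bad_p_texts):
            p.decompose()  # drop the whole promo paragraph, not just the phrase
    return str(soup)

sample = (
    "<p>正文第一段。</p>"
    "<p>请收藏本站:https://www.ddxss.cc/</p>"
    "<p>正文第二段。</p>"
)
print(drop_bad_paragraphs(sample))
# -> <p>正文第一段。</p><p>正文第二段。</p>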