
Commit

Merge pull request #2012 from dipu-bd/dev
Version 3.2.8
dipu-bd authored Jul 21, 2023
2 parents 74fc656 + 2a597c7 commit afdeb19
Showing 16 changed files with 779 additions and 557 deletions.
4 changes: 3 additions & 1 deletion .github/contribs.json
@@ -68,5 +68,7 @@
   "dev ops": null,
   "[email protected]": null,
   "Anuj2976": null,
-  "[email protected]": null
+  "[email protected]": null,
+  "Seven0492": null,
+  "[email protected]": null
 }
759 changes: 407 additions & 352 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lncrawl/VERSION
@@ -1 +1 @@
-3.2.7
+3.2.8
14 changes: 7 additions & 7 deletions lncrawl/binders/calibre.py
@@ -17,7 +17,7 @@ def run_ebook_convert(*args):
     isdebug = os.getenv("debug_mode")
     with open(os.devnull, "w", encoding="utf8") as dumper:
         subprocess.call(
-            [EBOOK_CONVERT] + list(args),
+            args=[EBOOK_CONVERT] + list(args),
             stdout=None if isdebug else dumper,
             stderr=None if isdebug else dumper,
         )
@@ -56,12 +56,12 @@ def epub_to_calibre(app, epub_file, out_fmt):
         file_name_without_ext,
         "--authors",
         app.crawler.novel_author,
-        '--comments',
+        "--comments",
         app.crawler.novel_synopsis,
-        '--language',
-        app.crawler.language,
-        '--tags',
-        app.crawler.novel_tags,
+        "--language",
+        app.crawler.novel_language,
+        "--tags",
+        ",".join(app.crawler.novel_tags),
         "--series",
         app.crawler.novel_title,
         "--publisher",
@@ -86,7 +86,7 @@ def epub_to_calibre(app, epub_file, out_fmt):
     run_ebook_convert(*args)
 
     if os.path.exists(out_file):
-        print("Created: %s" % out_file_name)
+        logger.info("Created: %s" % out_file_name)
         return out_file
     else:
        logger.error("[%s] conversion failed: %s", out_fmt, epub_file_name)
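
Note on the `--tags` and `args=` changes above: every argv entry handed to `subprocess.call` must be a string, so passing the raw tag list would crash inside `run_ebook_convert`. A minimal sketch of the failure mode, using `echo` as a stand-in for `ebook-convert`:

import subprocess

# A list inside argv raises: TypeError: expected str, bytes or os.PathLike
# subprocess.call(["echo", ["fantasy", "action"]])

# Joining first yields one well-formed argument, as the new code does:
subprocess.call(["echo", ",".join(["fantasy", "action"])])  # prints: fantasy,action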
48 changes: 26 additions & 22 deletions lncrawl/templates/novelmtl.py
@@ -57,28 +57,32 @@ def parse_authors(self, soup: BeautifulSoup):
             yield a.text.strip()
 
     def select_chapter_tags(self, soup: BeautifulSoup):
-        last_page = soup.select("#chapters .pagination li a")[-1]["href"]
-        last_page_qs = parse_qs(urlparse(last_page).query)
-        max_page = int(last_page_qs["page"][0])
-        wjm = last_page_qs["wjm"][0]
-
-        futures: List[Future] = []
-        for i in range(max_page + 1):
-            payload = {
-                "page": i,
-                "wjm": wjm,
-                "_": self.cur_time,
-                "X-Requested-With": "XMLHttpRequest",
-            }
-            url = f"{self.home_url}e/extend/fy.php?{urlencode(payload)}"
-            f = self.executor.submit(self.get_soup, url)
-            futures.append(f)
-
-        self.resolve_futures(futures, desc="TOC", unit="page")
-        for i, future in enumerate(futures):
-            if not future.done():
-                raise LNException(f"Failed to get page {i + 1}")
-            soup = future.result()
+        tag = soup.select("#chapters .pagination li a")
+        if tag:
+            last_page = tag[-1]["href"]
+            last_page_qs = parse_qs(urlparse(last_page).query)
+            max_page = int(last_page_qs["page"][0])
+            wjm = last_page_qs["wjm"][0]
+
+            futures: List[Future] = []
+            for i in range(max_page + 1):
+                payload = {
+                    "page": i,
+                    "wjm": wjm,
+                    "_": self.cur_time,
+                    "X-Requested-With": "XMLHttpRequest",
+                }
+                url = f"{self.home_url}e/extend/fy.php?{urlencode(payload)}"
+                f = self.executor.submit(self.get_soup, url)
+                futures.append(f)
+
+            self.resolve_futures(futures, desc="TOC", unit="page")
+            for i, future in enumerate(futures):
+                if not future.done():
+                    raise LNException(f"Failed to get page {i + 1}")
+                soup = future.result()
+                yield from soup.select("ul.chapter-list li a")
+        else:
             yield from soup.select("ul.chapter-list li a")
 
     def parse_chapter_item(self, tag: Tag, id: int) -> Chapter:
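
For reference, a small sketch of the pagination parsing that the new `if tag:` guard protects (the `href` value here is made up):

from urllib.parse import parse_qs, urlparse

last_page = "/e/extend/fy.php?page=12&wjm=abc123"  # hypothetical pager href
qs = parse_qs(urlparse(last_page).query)
max_page, wjm = int(qs["page"][0]), qs["wjm"][0]
print(max_page, wjm)  # 12 abc123 -> pages 0..12 are fetched concurrently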
4 changes: 2 additions & 2 deletions lncrawl/webdriver/local.py
@@ -74,9 +74,9 @@ def create_local(
     # options.add_argument("--disable-dev-shm-usage")
 
     # Add capabilities
-    options.set_capability("quietExceptions", True)
+    #options.set_capability("quietExceptions", True)
     options.set_capability("acceptInsecureCerts", True)
-    options.set_capability("useAutomationExtension", False)
+    #options.set_capability("useAutomationExtension", False)
 
     # Configure window behavior
     if headless:
4 changes: 2 additions & 2 deletions lncrawl/webdriver/remote.py
@@ -52,9 +52,9 @@ def create_remote(
     options.add_argument("--no-first-run")
 
     # Add capabilities
-    options.set_capability("quietExceptions", True)
+    #options.set_capability("quietExceptions", True)
     options.set_capability("acceptInsecureCerts", True)
-    options.set_capability("useAutomationExtension", False)
+    #options.set_capability("useAutomationExtension", False)
 
     # Chrome specific experimental options
     options.accept_insecure_certs = True
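
A likely reason both webdriver modules comment these out: `quietExceptions` and `useAutomationExtension` are not standard W3C capability names, and W3C-compliant drivers may reject a session that requests unknown, un-prefixed capabilities. A hedged sketch of the distinction, assuming Selenium 4's Python bindings:

from selenium.webdriver.chrome.options import Options

options = Options()
options.set_capability("acceptInsecureCerts", True)  # standard W3C capability
# Vendor-specific capabilities need a prefix such as "goog:":
options.set_capability("goog:loggingPrefs", {"browser": "ALL"})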
2 changes: 1 addition & 1 deletion sources/_index.json

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions sources/en/l/lightnovelheaven.py
@@ -52,6 +52,16 @@ def read_novel_info(self):
             self.novel_author = author[0].text
         logger.info("Novel author: %s", self.novel_author)
 
+        self.novel_tags = [tag.text.strip() for tag in soup.select(".genres-content a")]
+        logger.info("Novel tags: %s", self.novel_tags)
+
+        synopsis = soup.select_one(".summary__content")
+        if synopsis:
+            for h3 in synopsis.select("h3"):
+                h3.extract()
+            self.novel_synopsis = self.cleaner.extract_contents(synopsis)
+        logger.info("Novel synopsis: %s", self.novel_synopsis)
+
         self.novel_id = soup.select_one("#manga-chapters-holder")["data-id"]
         logger.info("Novel id: %s", self.novel_id)
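
For context, `Tag.extract()` detaches a node from the parse tree, which is how the new code keeps boilerplate headings out of the synopsis. A self-contained sketch with made-up HTML:

from bs4 import BeautifulSoup

html = '<div class="summary__content"><h3>Summary</h3><p>Actual synopsis.</p></div>'
synopsis = BeautifulSoup(html, "html.parser").select_one(".summary__content")
for h3 in synopsis.select("h3"):
    h3.extract()  # remove the heading node
print(synopsis)  # <div class="summary__content"><p>Actual synopsis.</p></div>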
107 changes: 72 additions & 35 deletions sources/en/l/lightnovelme.py
@@ -1,30 +1,43 @@
 # -*- coding: utf-8 -*-
 import json
 import logging
+import re
 
 from urllib.parse import quote
 
 from bs4.element import Tag
 
 from lncrawl.core.crawler import Crawler
+from lncrawl.models import Volume
+from lncrawl.models import Chapter
 
 logger = logging.getLogger(__name__)
 
-search_url = "https://lightnovels.me/api/search?keyword=%s&index=0&limit=20"
-chapter_list_url = "https://lightnovels.me/api/chapters?id=%d&index=1&limit=15000"
+search_url = "/api/search?keyword=%s&index=0&limit=20"
+chapter_list_url = "/api/chapters?id=%d&index=1&limit=15000"
 
 
-class LightnovelMe(Crawler):
-    base_url = ["https://lightnovels.me/"]
+class LightNovelsLive(Crawler):
+    base_url = [
+        "http://lightnovels.me/",
+        "https://lightnovels.me/",
+        "http://lightnovels.live/",
+        "https://lightnovels.live/"
+    ]
+
     has_manga = False
     has_mtl = False
 
     def search_novel(self, query):
-        data = self.get_json(search_url % quote(query))
+        url = self.absolute_url(search_url % quote(query))
+        data = self.get_json(url)
 
         results = []
         for item in data["results"]:
             results.append(
                 {
                     "title": item["novel_name"],
-                    "url": "https://lightnovels.me/novel" + item["novel_slug"],
+                    "url": self.absolute_url("/novel" + item["novel_slug"]),
                     "info": f"Status: {item['status']} | Latest: {item['chapter_name']}",
                 }
             )
@@ -33,42 +46,66 @@ def search_novel(self, query):
 
     def read_novel_info(self):
         soup = self.get_soup(self.novel_url)
-        script = soup.select_one("script#__NEXT_DATA__")
-        assert isinstance(script, Tag)
+        script = soup.select_one('script#__NEXT_DATA__')
+        assert isinstance(script, Tag), "No available novel info."
 
         data = json.loads(script.text)
 
-        novel_info = data["props"]["pageProps"]["novelInfo"]
-        novel_id = novel_info["novel_id"]
-        self.novel_title = novel_info["novel_name"]
-        self.novel_cover = self.absolute_url(novel_info["novel_image"])
-        self.novel_author = ", ".join(
-            [x["name"] for x in data["props"]["pageProps"]["authors"]]
+        novel_info = data['props']['pageProps']['novelInfo']
+        novel_id = int(novel_info['novel_id'])
+
+        self.novel_title = novel_info['novel_name']
+        self.novel_cover = self.absolute_url(novel_info['novel_image'])
+        self.novel_author = ', '.join(
+            [author['name'] for author in data['props']['pageProps']['authors']]
         )
 
-        data = self.get_json(chapter_list_url % (novel_id))
+        # Adds proper spacing in the synopsis. (lossy)
+        #
+        # Regex101 link: https://regex101.com/r/lajsXs/3
+        for paragraph in re.split(r'[.!?](?=\w+)(?!\S+[.!?()]+(\s|\w))', novel_info['novel_description']):
+            if paragraph is None:
+                self.novel_synopsis += "<br/><br/>"
+                continue
 
-        for i, item in enumerate(data["results"]):
-            chap_id = i + 1
-            vol_id = i // 100 + 1
-            if i % 100 == 0:
-                self.volumes.append({"id": vol_id})
+            self.novel_synopsis += paragraph
+
+            if paragraph.endswith('!') | paragraph.endswith('?') | paragraph.endswith('.'):
+                pass
+            else:
+                self.novel_synopsis += "."
+
+        self.novel_tags = ', '.join(
+            [genre['name'] for genre in data['props']['pageProps']['genres']]
+        )
+
+        url = self.absolute_url(chapter_list_url % novel_id)
+        data = self.get_json(url)
+
+        for index, item in enumerate(data['results']):
+            chap_id = index + 1
+            vol_id = index // 100 + 1
+            if index % 100 == 0:
+                self.volumes.append(
+                    Volume(id=vol_id)
+                )
             self.chapters.append(
-                {
-                    "id": chap_id,
-                    "volume": vol_id,
-                    "title": item["chapter_name"],
-                    "url": self.absolute_url(item["slug"]),
-                }
+                Chapter(
+                    id=chap_id,
+                    volume=vol_id,
+                    title=item['chapter_name'],
+                    url=self.absolute_url(item["slug"])
+                )
             )
 
     def download_chapter_body(self, chapter):
-        soup = self.get_soup(chapter["url"])
-        script = soup.select_one("script#__NEXT_DATA__")
-        assert isinstance(script, Tag)
-        data = json.loads(script.text)
-
-        chapter_info = data["props"]["pageProps"]["cachedChapterInfo"]
-        content = str(chapter_info["content"])
-        content = content.replace("\u003c", "<").replace("\u003e", ">")
-        content = content.replace("<p>" + chapter_info["chapter_name"] + "</p>", "", 1)
-        return content
+        tag = self.get_soup(chapter['url']).select_one(".chapter-content div")
+
+        str_chapter = self.cleaner.extract_contents(tag).replace(r"\'", "'").strip()
+        if str_chapter == "":
+            print(f" Warning: no contents in chapter {chapter['id']}, {chapter['title']}.")
+            str_chapter = '<h4>Empty chapter.</h4>' +\
+                '<p><mark style="color:Green">Hint</mark>: ' +\
+                'reporting this to your provider <i>might</i> solve the issue.</p>'
+
+        return str_chapter
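
The new synopsis loop relies on a `re.split` subtlety: a capturing group in the pattern is interleaved with the split fragments, and since this group sits inside a negative lookahead it never matches on a split, so `None` is interleaved instead (the branch that appends `<br/><br/>`). A small sketch with made-up input; note the committed code chains the `endswith` checks with bitwise `|`, which happens to behave like `or` on booleans:

import re

pattern = r'[.!?](?=\w+)(?!\S+[.!?()]+(\s|\w))'
print(re.split(pattern, "He left.She stayed"))
# ['He left', None, 'She stayed'] -- the '.' is consumed by the split, so the
# loop later re-appends punctuation to fragments that lost theirs.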
1 change: 1 addition & 0 deletions sources/en/l/lightnovelreader.py
@@ -18,6 +18,7 @@ class LightnovelReader(Crawler):
         "https://www.lightnovelreader.me/",
         "https://lnreader.org/",
         "https://www.lnreader.org/",
+        "http://readlightnovel.online/"
     ]
 
     def initialize(self) -> None:
1 change: 1 addition & 0 deletions sources/en/m/mixednovel.py
@@ -13,6 +13,7 @@ class MixedNovelNet(Crawler):
     has_mtl = True
     base_url = [
         "https://mixednovel.net/",
+        "https://earlynovel.net/",
     ]
 
     def initialize(self):
62 changes: 55 additions & 7 deletions sources/en/n/novelhall.py
@@ -1,36 +1,84 @@
 # -*- coding: utf-8 -*-
 import logging
 from lncrawl.core.crawler import Crawler
+from urllib.parse import quote_plus
 
 logger = logging.getLogger(__name__)
 
+search_url = "/index.php?s=so&module=book&keyword="
 
-class NovelhallCrawler(Crawler):
+
+class NovelHallCrawler(Crawler):
+    base_url = [
+        "https://www.novelhall.com/",
+        "http://www.novelhall.com/",
+        "https://novelhall.com/",
+        "http://novelhall.com/",
+    ]
+
     has_manga = False
     has_mtl = True
-    base_url = "https://www.novelhall.com/"
+
+    def search_novel(self, query: str):
+        soup = self.get_soup(self.absolute_url(search_url + quote_plus(query.lower())))
+
+        results = []
+        for novel in soup.select('.section3 table tbody tr'):
+            novel = novel.findAll('a')
+            novel_link = novel[1]
+            latest_chapter = novel[2].text.strip().split('.')
+            chapter_number = latest_chapter[0]
+
+            if chapter_number.isdigit():
+                latest_chapter = "Chapter %s: %s" % (chapter_number, latest_chapter[1])
+            else:
+                latest_chapter = "Latest chapter: " + latest_chapter[0]
+
+            results.append(
+                {
+                    "title": novel_link.text.strip(),
+                    "url": self.absolute_url(novel_link['href']),
+                    "info": latest_chapter
+                }
+            )
+
+        return results
 
     def read_novel_info(self):
         logger.debug("Visiting %s", self.novel_url)
         soup = self.get_soup(self.novel_url)
+        if soup is None:
+            raise LookupError("novel url is invalid.")
 
-        possible_title = soup.select_one(".book-info h1")
-        assert possible_title, "No novel title"
-        self.novel_title = possible_title.text
+        book_info = soup.select_one("div.book-info")
+
+        self.novel_title = book_info.h1.text
+        assert self.novel_title, "no novel title"
         logger.info("Novel title: %s", self.novel_title)
 
         possible_image = soup.select_one("div.book-img img")
         if possible_image:
             self.novel_cover = self.absolute_url(possible_image["src"])
-            logger.info("Novel cover: %s", self.novel_cover)
+
+            if possible_image['src'] == "":
+                logger.warning("Novel cover: unavailable")
+            else:
+                logger.info("Novel cover: %s", self.novel_cover)
         else:
             logger.info("Novel cover: unavailable")
 
         author = soup.select("div.book-info div.total.booktag span.blue")[0]
         author.select_one("p").extract()
         self.novel_author = author.text.strip()
         logger.info("Novel author: %s", self.novel_author)
 
+        self.novel_tags = [soup.select_one("div.book-info div.total.booktag a.red").text.strip()]
+        logger.info("Novel tags: %s", self.novel_tags)
+
         for a in soup.select("div#morelist.book-catalog ul li a"):
             chap_id = len(self.chapters) + 1
-            vol_id = 1 + len(self.chapters) // 100
+            vol_id = len(self.chapters) // 100 + 1
             if len(self.volumes) < vol_id:
                 self.volumes.append({"id": vol_id})
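
One arithmetic detail from the chapter loop: `len(self.chapters) // 100 + 1` is the same volume bucketing as the old `1 + len(self.chapters) // 100`, grouping each run of 100 chapters into one volume:

# chapter count before append -> volume id
for count in (0, 99, 100, 199, 200):
    print(count, count // 100 + 1)  # 1, 1, 2, 2, 3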