Commit 1fe9346
Merge pull request #1699 from dipu-bd/dev

Version 3.0.1

dipu-bd authored Oct 9, 2022
2 parents e5e1bc6 + b247a39
Showing 15 changed files with 490 additions and 422 deletions.
666 changes: 333 additions & 333 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lncrawl/VERSION
@@ -1 +1 @@
-3.0.0
+3.0.1
2 changes: 1 addition & 1 deletion lncrawl/core/browser.py
@@ -213,7 +213,7 @@ def wait(
         timeout: Optional[float] = 60,
         poll_frequency: Optional[float] = 0.5,
         ignored_exceptions: Iterable[Exception] = [],
-        expected_conditon=EC.visibility_of_all_elements_located,
+        expected_conditon=EC.presence_of_element_located,
     ):
         """Waits for an element to be visible on the current page by CSS selector.
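
The default wait condition is relaxed here: `presence_of_element_located` fires as soon as one matching node exists in the DOM, while the old `visibility_of_all_elements_located` also required every match to be displayed, which can stall on pages with hidden duplicate elements. A minimal sketch of the new default, assuming an already-initialized Selenium `driver`:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# `driver` is assumed to be a live Selenium WebDriver instance.
# presence_of_element_located returns the first matching element as soon
# as it is attached to the DOM, whether or not it is visible yet.
element = WebDriverWait(driver, timeout=60, poll_frequency=0.5).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "#novel-title"))
)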
8 changes: 3 additions & 5 deletions lncrawl/core/downloader.py
@@ -125,13 +125,11 @@ def _fetch_cover_image(app):
             cover_file,
         )
     except Exception as e:
-        logger.exception("Failed to download cover", e)
+        if logger.isEnabledFor(logging.DEBUG):
+            logger.exception("Failed to download cover", e)

     if not os.path.isfile(cover_file):
-        try:
-            generate_cover_image(cover_file)
-        except Exception as e:
-            logger.exception("Failed to generate cover", e)
+        generate_cover_image(cover_file)

     app.progress += 1
     app.book_cover = cover_file
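
Two behavior changes here: a failed cover download now logs its traceback only when DEBUG logging is enabled, and the fallback `generate_cover_image` call is no longer wrapped in its own try/except, so a failure to generate a placeholder cover propagates instead of being silently logged. A small sketch of the DEBUG-gated pattern (the raised error is a stand-in):

import logging

logger = logging.getLogger(__name__)

try:
    raise ConnectionError("cover host unreachable")  # stand-in failure
except Exception:
    # A full traceback is noise for normal runs; surface it only when
    # the logger's effective level is DEBUG.
    if logger.isEnabledFor(logging.DEBUG):
        logger.exception("Failed to download cover")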
2 changes: 2 additions & 0 deletions lncrawl/core/exeptions.py
@@ -1,6 +1,7 @@
 from urllib.error import URLError

 from cloudscraper.exceptions import CloudflareException
+from PIL import UnidentifiedImageError
 from requests.exceptions import RequestException
 from urllib3.exceptions import HTTPError

@@ -19,4 +20,5 @@ class FallbackToBrowser(Exception):
     CloudflareException,
     RequestException,
     FallbackToBrowser,
+    UnidentifiedImageError,
 )
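
Adding `UnidentifiedImageError` to the group means a download that returns bytes Pillow cannot parse is treated like any other scraper error and goes through the same retry path. A self-contained sketch of the effect, with the tuple trimmed to the new member for illustration:

from io import BytesIO

from PIL import Image, UnidentifiedImageError

ScraperErrorGroup = (UnidentifiedImageError,)  # trimmed for illustration

try:
    Image.open(BytesIO(b"<html>not an image</html>"))
except ScraperErrorGroup:
    print("caught by the scraper's shared retry logic")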
24 changes: 12 additions & 12 deletions lncrawl/core/scraper.py
@@ -5,7 +5,7 @@
 import ssl
 from io import BytesIO
 from typing import Any, Dict, Optional, Union
-from urllib.parse import ParseResult, urlparse
+from urllib.parse import ParseResult, quote, urlparse

 from bs4 import BeautifulSoup
 from cloudscraper import CloudScraper, User_Agent
@@ -80,11 +80,11 @@ def __process_request(self, method: str, url, **kwargs):
         kwargs["proxies"] = self.__get_proxies(_parsed.scheme)
         headers = kwargs.pop("headers", {})
         headers = CaseInsensitiveDict(headers)
-        headers.setdefault("Host", _parsed.hostname)
+        # headers.setdefault("Host", _parsed.hostname)
         headers.setdefault("Origin", self.home_url.strip("/"))
-        headers.setdefault("Referer", self.last_soup_url.strip("/"))
+        headers.setdefault("Referer", self.last_soup_url or self.home_url)
         headers.setdefault("User-Agent", self.user_agent)
-        kwargs["headers"] = headers
+        kwargs["headers"] = {quote(k): quote(v) for k, v in headers.items() if v}

         while retry >= 0:
             try:
@@ -96,9 +96,9 @@ def __process_request(self, method: str, url, **kwargs):
                 with self.domain_gate(_parsed.hostname):
                     with no_ssl_verification():
                         response: Response = method_call(url, **kwargs)
-                        response.encoding = "utf8"

                 response.raise_for_status()
+                response.encoding = "utf8"
                 self.cookies.update({x.name: x.value for x in response.cookies})
                 return response
             except ScraperErrorGroup as e:
@@ -108,9 +108,6 @@ def __process_request(self, method: str, url, **kwargs):
                 retry -= 1
                 logger.debug(f"{type(e).__qualname__}: {e} | Retrying...", e)

-                self.change_user_agent()
-                kwargs["headers"].setdefault("User-Agent", self.user_agent)
-
                 if isinstance(e, ProxyError):
                     for proxy_url in kwargs.get("proxies", {}).values():
                         remove_faulty_proxies(proxy_url)
@@ -237,10 +234,12 @@ def download_image(self, url: str, headers={}, **kwargs) -> Image:
             content = base64.b64decode(url.split("base64,")[-1])
         else:
             headers = CaseInsensitiveDict(headers)
-            headers.setdefault(
-                "Accept",
-                "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.9",
-            )
+            headers.setdefault("Origin", None)
+            headers.setdefault("Referer", None)
+            # headers.setdefault(
+            #     "Accept",
+            #     "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.9",
+            # )
             response = self.__process_request("get", url, headers=headers, **kwargs)
             content = response.content
         return Image.open(BytesIO(content))
@@ -258,6 +257,7 @@ def get_json(self, url, headers={}, **kwargs) -> Any:
     def post_json(self, url, data={}, headers={}) -> Any:
         """Make a POST request and return the content as JSON object"""
         headers = CaseInsensitiveDict(headers)
+        headers.setdefault("Content-Type", "application/json")
         headers.setdefault(
            "Accept",
            "application/json,text/plain,*/*",
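
The most consequential change in this file is the last header line of `__process_request`: names and values are now percent-encoded with `urllib.parse.quote` and empty values are dropped. Header values are transmitted as Latin-1, so a Referer carrying non-Latin-1 characters would otherwise raise `UnicodeEncodeError` deep inside the request stack. A standalone sketch of that transformation, assuming a header set like the one built above:

from urllib.parse import quote

from requests.structures import CaseInsensitiveDict

headers = CaseInsensitiveDict({
    "Referer": "https://example.com/глава-1",  # non-Latin-1 value
    "Origin": "",                              # empty: will be dropped
})
safe = {quote(k): quote(v) for k, v in headers.items() if v}
print(safe)
# {'Referer': 'https%3A//example.com/%D0%B3%D0%BB%D0%B0%D0%B2%D0%B0-1'}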
7 changes: 4 additions & 3 deletions lncrawl/core/taskman.py
@@ -167,10 +167,11 @@ def resolve_futures(
             except Exception as e:
                 if isinstance(e, KeyboardInterrupt):
                     break
-                message = f"{type(e).__name__}: {e}"
-                if not bar.disable:
+                if bar.disable:
+                    logger.exception("Failure to resolve future")
+                else:
                     bar.clear()
-                    logger.warning(message)
+                    logger.warning(f"{type(e).__name__}: {e}")
             finally:
                 bar.update()
         finally:
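
The rewrite splits logging by whether a progress bar is being drawn: with the bar disabled there is nothing to garble, so the full traceback is kept; with a live bar, the bar is cleared first and only a one-line warning is printed. A self-contained sketch of the same pattern (the names `resolve` and `disable_bar` are illustrative, not the project's API):

import logging

from tqdm import tqdm

logger = logging.getLogger(__name__)

def resolve(futures, disable_bar=False):
    bar = tqdm(total=len(futures), disable=disable_bar)
    for future in futures:
        try:
            future.result()
        except Exception as e:
            if isinstance(e, KeyboardInterrupt):
                break
            if bar.disable:
                logger.exception("Failure to resolve future")  # full traceback
            else:
                bar.clear()  # keep the warning from mangling the bar
                logger.warning(f"{type(e).__name__}: {e}")
        finally:
            bar.update()
    bar.close()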
2 changes: 1 addition & 1 deletion lncrawl/templates/browser/basic.py
@@ -129,7 +129,7 @@ def download_image(self, url: str, headers={}, **kwargs) -> Image:
             if logger.isEnabledFor(logging.DEBUG):
                 logger.exception("Failed in download image: %s", e)
             self.init_browser()
-            self.browser.visit(url)
+            self._browser.visit(url)
             self.browser.wait("img", By.TAG_NAME)
             png = self.browser.find("img", By.TAG_NAME).screenshot_as_png
             return Image.open(BytesIO(png))
1 change: 0 additions & 1 deletion lncrawl/templates/novelmtl.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 import time
 from concurrent.futures import Future
 from typing import List
2 changes: 1 addition & 1 deletion lncrawl/webdriver/elements.py
@@ -62,7 +62,7 @@ def as_tag(self) -> Tag:
         html = self.outer_html()
         if not hasattr(self, "_tag") or self._html != html:
             self._html = html
-            self._tag = self._soup_maker.make_tag(self._tag)
+            self._tag = self._soup_maker.make_tag(html)
         return self._tag

     def find_all(
2 changes: 1 addition & 1 deletion sources/_index.json

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions sources/en/f/freewebnovel.py
@@ -13,6 +13,9 @@
 class FreeWebNovelCrawler(SearchableSoupTemplate, ChapterOnlySoupTemplate):
     base_url = ["https://freewebnovel.com/"]

+    def initialize(self) -> None:
+        self.cleaner.bad_tags.update(["h4"])
+
     def select_search_items(self, query: str):
         data = {"searchkey": query}
         soup = self.post_soup(f"{self.home_url}search/", data=data)
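
`bad_tags` appears to be the cleaner's blacklist of tags stripped from chapter HTML; adding `h4` drops the site's injected headings. A rough sketch of that kind of cleanup with BeautifulSoup (the cleaner's internals are assumed, not quoted):

from bs4 import BeautifulSoup

bad_tags = {"h4"}  # mirrors self.cleaner.bad_tags after initialize()
html = "<div><h4>Read free on ...</h4><p>Actual chapter text.</p></div>"
soup = BeautifulSoup(html, "html.parser")
for tag in soup.find_all(list(bad_tags)):
    tag.decompose()  # remove the blacklisted tag and its contents
print(soup)  # <div><p>Actual chapter text.</p></div>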
7 changes: 5 additions & 2 deletions sources/en/m/mtlnation.py
@@ -9,11 +9,14 @@


 class MTLNation(Crawler):
-    base_url = ["https://mtlnation.com/", "https://www.mtlnation.com/"]
+    base_url = [
+        "https://mtlnation.com/",
+        "https://www.mtlnation.com/",
+    ]
     has_mtl = True

     def initialize(self):
-        pass
+        self.init_executor(3)

     def login(self, email: str, password: str) -> None:
         self.post_json(
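
`init_executor(3)` caps this crawler's worker pool at three threads, presumably to stay under the site's rate limit. In plain `concurrent.futures` terms the effect is roughly the following (ThreadPoolExecutor stands in for the crawler's internal executor):

from concurrent.futures import ThreadPoolExecutor

# At most three downloads are in flight against the site at any moment.
with ThreadPoolExecutor(max_workers=3) as executor:
    futures = [executor.submit(print, f"chapter {i}") for i in range(5)]
    for f in futures:
        f.result()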
44 changes: 14 additions & 30 deletions sources/en/n/novelupdatescc.py
@@ -3,6 +3,7 @@
 from urllib.parse import quote

 from lncrawl.core.crawler import Crawler
+from lncrawl.models.chapter import Chapter

 logger = logging.getLogger(__name__)
 search_url = "https://www.novelupdates.cc/search/%s/1"
@@ -38,48 +39,31 @@ def read_novel_info(self):
         logger.debug("Visiting %s", self.novel_url)
         soup = self.get_soup(self.novel_url)

-        possible_title = soup.select_one("div.book-name")
+        possible_title = soup.select_one(".book-name")
         assert possible_title, "No novel title"
         self.novel_title = possible_title.text.strip()
         logger.info("Novel title: %s", self.novel_title)

-        self.novel_author = soup.select_one("div.author span.name").text.strip()
+        possible_author = soup.select_one(".person-info .author .name")
+        if possible_author:
+            self.novel_author = possible_author.text.strip()
         logger.info("Novel author: %s", self.novel_author)

-        possible_image = soup.select_one("div.book-img img")
+        possible_image = soup.select_one(".book-img img[src]")
         if possible_image:
             self.novel_cover = self.absolute_url(possible_image["src"])
         logger.info("Novel cover: %s", self.novel_cover)

-        # Extract volume-wise chapter entries
-        chapters = soup.select("ul.chapter-list a")
-
-        for a in chapters:
-            chap_id = len(self.chapters) + 1
-            vol_id = 1 + len(self.chapters) // 100
-            if len(self.volumes) < vol_id:
-                self.volumes.append({"id": vol_id})
+        for a in soup.select("ul.chapter-list a[href]"):
             self.chapters.append(
-                {
-                    "id": chap_id,
-                    "volume": vol_id,
-                    "url": self.absolute_url(a["href"]),
-                    "title": a.select_one("p.chapter-name").text.strip()
-                    or ("Chapter %d" % chap_id),
-                }
+                Chapter(
+                    id=len(self.chapters) + 1,
+                    title=a.text.strip(),
+                    url=self.absolute_url(a["href"]),
+                )
             )

     def download_chapter_body(self, chapter):
         soup = self.get_soup(chapter["url"])

-        chapter["title"] = soup.select_one("h1.chapter-title").text.strip()
-
-        self.cleaner.bad_text_regex = set(
-            [
-                r"^translat(ed by|or)",
-                r"(volume|chapter) .?\d+",
-            ]
-        )
-        body_parts = soup.select_one("div.chapter-entity")
-
-        return self.cleaner.extract_contents(body_parts)
+        content = soup.select_one("#chapter-entity")
+        return self.cleaner.extract_contents(content)
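
The rewritten loop drops the hand-rolled volume bucketing and per-chapter title fallback in favor of the `Chapter` model with the anchor's own text. A standalone sketch of the selection logic, with the model swapped for a plain dict so it runs without lncrawl installed:

from bs4 import BeautifulSoup

html = """
<ul class="chapter-list">
  <li><a href="/book/1/chapter-1">Chapter 1: Rebirth</a></li>
  <li><a>broken entry without href</a></li>
</ul>
"""
soup = BeautifulSoup(html, "html.parser")

chapters = []
for a in soup.select("ul.chapter-list a[href]"):  # [href] skips broken anchors
    chapters.append({
        "id": len(chapters) + 1,
        "title": a.text.strip(),
        "url": a["href"],
    })
print(chapters)
# [{'id': 1, 'title': 'Chapter 1: Rebirth', 'url': '/book/1/chapter-1'}]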
