Merge pull request #1536 from dipu-bd/dev
Dev
dipu-bd authored Sep 8, 2022
2 parents 8639f7e + 84d92d0 commit 9894845
Showing 13 changed files with 589 additions and 352 deletions.
526 changes: 273 additions & 253 deletions README.md

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions compose.yml
@@ -8,7 +8,7 @@ services:
dockerfile: ./scripts/Dockerfile
command: python -m lncrawl --bot discord --shard-id 0 --shard-count 1 --suppress
environment:
CLOUD_DRIVE: "ANONFILES"
CLOUD_DRIVE: "GOFILE"
DISCORD_TOKEN: "${DISCORD_TOKEN}"
DISCORD_SIGNAL_CHAR: "${DISCORD_SIGNAL_CHAR}"
DISCORD_DISABLE_SEARCH: "${DISCORD_DISABLE_SEARCH}"
@@ -20,6 +20,6 @@ services:
dockerfile: ./scripts/Dockerfile
command: python -m lncrawl --bot telegram
environment:
CLOUD_DRIVE: "ANONFILES"
CLOUD_DRIVE: "GOFILE"
TELEGRAM_TOKEN: "${TELEGRAM_TOKEN}"
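Note on this change: both the discord and telegram bot services now set CLOUD_DRIVE to GOFILE instead of ANONFILES. A minimal sketch of how a process might pick its uploader from that variable; the dispatch below is illustrative rather than lncrawl's actual wiring, and only the gofile module path appears in this commit:

import os


def pick_uploader():
    # CLOUD_DRIVE comes from the container environment defined in compose.yml.
    drive = os.getenv('CLOUD_DRIVE', 'GOFILE').upper()
    if drive == 'GOFILE':
        # This uploader module is part of the repository (lncrawl/utils/uploader/gofile.py).
        from lncrawl.utils.uploader.gofile import upload
        return upload
    raise ValueError('Unsupported CLOUD_DRIVE: %s' % drive)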

11 changes: 9 additions & 2 deletions lncrawl/core/app.py
@@ -3,6 +3,7 @@
import logging
import os
import shutil
from threading import Thread
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlparse

@@ -19,7 +20,6 @@

logger = logging.getLogger(__name__)


class App:
'''Bots are based on top of an instance of this app'''

@@ -41,6 +41,13 @@ def __init__(self):
atexit.register(self.destroy)
# end def

def __background(self, target_method, *args, **kwargs):
t = Thread(target=target_method, args=args, kwargs=kwargs)
t.start()
while t.is_alive():
t.join(1)


# ----------------------------------------------------------------------- #

def initialize(self):
@@ -139,7 +146,7 @@ def get_novel_info(self):

print('Retrieving novel info...')
print(self.crawler.novel_url)
self.crawler.read_novel_info()
self.__background(self.crawler.read_novel_info)
print('NOVEL: %s' % self.crawler.novel_title)
print('%d volumes and %d chapters found' %
(len(self.crawler.volumes), len(self.crawler.chapters)))
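A note on the new __background helper used here: joining the worker thread in one-second slices lets the main thread wake up regularly, so a KeyboardInterrupt can be handled promptly while read_novel_info is still running (on some platforms a bare join() blocks without being interruptible). A standalone sketch of the same pattern, not the project's code; exceptions raised inside the thread are not propagated to the caller:

import time
from threading import Thread


def run_in_background(target, *args, **kwargs):
    # Join in 1-second slices so the main thread wakes up regularly
    # instead of sitting in one long, possibly uninterruptible join().
    t = Thread(target=target, args=args, kwargs=kwargs)
    t.start()
    while t.is_alive():
        t.join(1)


def slow_task(seconds):
    time.sleep(seconds)
    print('task finished')


if __name__ == '__main__':
    run_in_background(slow_task, 3)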
12 changes: 6 additions & 6 deletions lncrawl/core/crawler.py
@@ -95,12 +95,12 @@ def __init__(self) -> None:
self.enable_auto_proxy = False
# end def

def __generate_proxy(self, url, **kwargs):
def __generate_proxy(self, url, timeout:int = 0):
if not self.enable_auto_proxy or not url:
return None
# end if
scheme = urlparse(self.home_url).scheme
return { scheme: get_a_proxy(scheme, **kwargs) }
return { scheme: get_a_proxy(scheme, timeout) }
# end def

def __process_request(self, method: str, url, **kwargs):
@@ -149,7 +149,7 @@ def __process_request(self, method: str, url, **kwargs):
# end for
# end if
if retry != 0: # do not use proxy on last attempt
kwargs['proxies'] = self.__generate_proxy(url, timeout=2)
kwargs['proxies'] = self.__generate_proxy(url, 5)
# end if
# end try
# end while
@@ -270,7 +270,7 @@ def is_relative_url(self, url) -> bool:

def get_response(self, url, **kwargs) -> Response:
kwargs = kwargs or dict()
kwargs.setdefault('retry', 5)
kwargs.setdefault('retry', 3)
kwargs.setdefault('timeout', (7, 301)) # in seconds

result = self.__process_request('get', url, **kwargs)
@@ -280,13 +280,13 @@ def post_response(self, url, data={}, headers={}, **kwargs) -> Response:

def post_response(self, url, data={}, headers={}, **kwargs) -> Response:
kwargs = kwargs or dict()
kwargs.setdefault('retry', 2)
kwargs.setdefault('retry', 1)
headers = {k.lower(): v for k, v in headers.items()}
headers.setdefault('content-type', 'application/json')
kwargs['headers'] = headers
kwargs['data'] = data

return self.__process_request('get', url, **kwargs)
return self.__process_request('post', url, **kwargs)
# end def

def submit_form(self, url, data={}, multipart=False, headers={}) -> Response:
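Two points worth noting in this file: post_response now actually issues a POST (the old code delegated to 'get'), and __generate_proxy returns a requests-style proxies mapping keyed by the scheme of home_url. A minimal, illustrative sketch of how such a mapping is consumed; the helper name is made up, while the (7, 301) connect/read timeout mirrors the default set in get_response above:

import requests
from urllib.parse import urlparse


def fetch_with_proxy(url, proxy_url, timeout=(7, 301)):
    # requests expects proxies as a {scheme: proxy_url} mapping,
    # e.g. {'https': 'http://1.2.3.4:8080'}. Here the key comes from
    # the target URL's scheme; the crawler keys it by the scheme of
    # its home_url via get_a_proxy().
    scheme = urlparse(url).scheme
    return requests.get(url, proxies={scheme: proxy_url}, timeout=timeout)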
67 changes: 59 additions & 8 deletions lncrawl/core/downloader.py
@@ -92,14 +92,17 @@ def extract_chapter_images(app, chapter):
if not img or not img.has_attr('src'):
continue
# end if
full_url = app.crawler.absolute_url(img['src'], page_url=chapter['url'])
full_url = app.crawler.absolute_url(img['src'], page_url=chapter['url'])
if not full_url.startswith('http'):
continue
# end if
filename = hashlib.md5(full_url.encode()).hexdigest() + '.jpg'
img.attrs = {'src': 'images/' + filename, 'alt': filename}
chapter['images'][filename] = full_url
# end for

soup_body = soup.select_one('body')
assert isinstance(soup_body, bs4.Tag), 'Invalid soup body'
assert soup_body
chapter['body'] = ''.join([str(x) for x in soup_body.contents])
# end def

@@ -243,16 +246,15 @@ def download_cover_image(app):
# end def


def download_content_image(app, url, filename):
def download_content_image(app, url, filename, image_folder):
from .app import App
assert isinstance(app, App)
image_folder = os.path.join(app.output_path, 'images')
image_file = os.path.join(image_folder, filename)
try:
if os.path.isfile(image_file):
return
# end if
img = download_image(app, url)
img = download_image(app, url)
os.makedirs(image_folder, exist_ok=True)
with open(image_file, 'wb') as f:
img.convert('RGB').save(f, "JPEG")
@@ -268,6 +270,38 @@ def download_content_image(app, url, filename):
# end def


def discard_failed_images(app, chapter, failed):
from .app import App
assert isinstance(app, App)
assert app.crawler is not None
assert isinstance(chapter, dict), 'Invalid chapter'

if not chapter['body'] or not 'images' in chapter:
return
# end if

assert isinstance(chapter['images'], dict)
current_failed = [
filename for filename in failed
if filename in chapter['images']
]
if not current_failed:
return
# end if

soup = app.crawler.make_soup(chapter['body'])
for filename in current_failed:
chapter['images'].pop(filename)
for img in soup.select(f'img[alt="{filename}"]'):
img.extract()
# end for
# end for
soup_body = soup.select_one('body')
assert soup_body
chapter['body'] = ''.join([str(x) for x in soup_body.contents])
# end def


def download_chapter_images(app):
from .app import App
assert isinstance(app, App)
@@ -281,20 +315,37 @@ def download_chapter_images(app):
app,
)
]

# download content images
image_folder = os.path.join(app.output_path, 'images')
images_to_download = set([
(filename, url)
for chapter in app.chapters
for filename, url in chapter.get('images', {}).items()
])
futures_to_check += [
app.crawler.executor.submit(
download_content_image,
app,
url,
filename,
image_folder
)
for chapter in app.chapters
for filename, url in chapter.get('images', {}).items()
for filename, url in images_to_download
]

failed = []
try:
resolve_all_futures(futures_to_check, desc=' Images', unit='item')
failed = [
filename for filename, url in images_to_download
if not os.path.isfile(os.path.join(image_folder, filename))
]
finally:
logger.info('Processed %d images' % app.progress)
logger.info('Processed %d images [%d failed]' % (app.progress, len(failed)))
# end try

for chapter in app.chapters:
discard_failed_images(app, chapter, failed)
# end for
# end def
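Summary of the reworked image flow: (filename, url) pairs are deduplicated across chapters, downloaded into a shared images folder, and any file still missing on disk afterwards is treated as failed and stripped from the chapter bodies by discard_failed_images. A standalone sketch of that pruning step, assuming bs4 and lxml (which the project already uses); the crawler sets each img's alt attribute to its hashed filename when rewriting sources:

from bs4 import BeautifulSoup


def drop_failed_images(html, failed_filenames):
    # Remove <img> tags whose alt attribute names a failed download,
    # then return the cleaned body markup.
    soup = BeautifulSoup(html, 'lxml')
    for name in failed_filenames:
        for img in soup.select(f'img[alt="{name}"]'):
            img.extract()
    body = soup.select_one('body')
    return ''.join(str(x) for x in body.contents) if body else str(soup)


print(drop_failed_images(
    '<p>text <img src="images/abc.jpg" alt="abc.jpg"/></p>',
    ['abc.jpg'],
))  # prints: <p>text </p>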
56 changes: 35 additions & 21 deletions lncrawl/core/proxy.py
@@ -1,5 +1,6 @@
import atexit
import logging
import random
from typing import Dict, List
import signal
import time
@@ -89,7 +90,7 @@ def __validate_and_add(scheme: str, ip: str, url: str):
)
resp.raise_for_status()
if resp.text.strip() == ip:
# print('found', url)
# print('>>>>>> found', url)
__proxy_list[scheme].append(url)
return True
# end if
@@ -100,36 +101,49 @@ def __validate_and_add(scheme: str, ip: str, url: str):
# end def


def __proxy_finder():
def __get_free_proxy_list(url):
with no_ssl_verification():
resp = __sess.get(
url,
headers={'user-agent': user_agents[0]},
timeout=5
)
if resp.status_code >= 400:
return []
# end if
html = resp.content.decode('utf8', 'ignore')
soup = BeautifulSoup(html, 'lxml')
return [
[td.text for td in tr.select('td')]
for tr in soup.select('.fpl-list table tbody tr')
]
# end def


def __find_proxies():
err_count = 0
while err_count < 3 and not __has_exit:
logger.debug('Fetching proxies | Current checklist: %d', len(__proxy_visited_at))
try:
resp = __sess.get(
'https://free-proxy-list.net/',
headers={'user-agent': user_agents[0]},
timeout=5
)
resp.raise_for_status()
html = resp.content.decode('utf8', 'ignore')
soup = BeautifulSoup(html, 'lxml')
rows = __get_free_proxy_list('https://free-proxy-list.net/')
rows += __get_free_proxy_list('https://www.sslproxies.org/')
random.shuffle(rows)
err_count = 0
for tr in soup.select('.fpl-list table tbody tr'):

for cols in rows:
if __has_exit:
break

cols = [td.text for td in tr.select('td')]
if 'hour' in cols[7]:
continue
if cols[4] not in ['anonymous', 'transparent']:
continue
# end if

ip = cols[0]
port = cols[1]
type = cols[4]
port = cols[1]
scheme = 'https' if cols[6] == 'yes' else 'http'
url = f'{scheme}://{ip}:{port}'

if type not in ['anonymous', 'transparent']:
continue
# end if

__proxy_list.setdefault(scheme, [])
if __proxy_visited_at.get(url, 0) + __proxy_ttl < time.time():
__validate_and_add(scheme, ip, url)
@@ -152,7 +166,7 @@ def __proxy_finder():
def start_proxy_fetcher():
atexit.register(stop_proxy_fetcher)
signal.signal(signal.SIGINT, stop_proxy_fetcher)
Thread(target=__proxy_finder, daemon=False).start()
Thread(target=__find_proxies, daemon=False).start()
# end def
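For context, the finder now scrapes candidate rows from both free-proxy-list.net and sslproxies.org, shuffles them, and skips candidates that were validated within the TTL window. Validation accepts a proxy when an IP-echo request made through it returns the proxy's own address. A rough sketch of that check; the echo endpoint used here (api.ipify.org) is a stand-in, since the project's actual check URL is not shown in this diff:

import requests


def proxy_works(scheme, ip, proxy_url, timeout=5):
    # Accept the proxy only if an IP-echo service, fetched through the
    # proxy itself, reports the proxy's address back.
    try:
        resp = requests.get(
            f'{scheme}://api.ipify.org/',
            proxies={scheme: proxy_url},
            timeout=timeout,
        )
        resp.raise_for_status()
        return resp.text.strip() == ip
    except requests.RequestException:
        return False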


10 changes: 4 additions & 6 deletions lncrawl/utils/uploader/gofile.py
@@ -2,19 +2,17 @@


# API Docs: https://gofile.io/api
def upload(file_path, description):
def upload(file_path, description=''):
with Session() as sess:
response = sess.get('https://api.gofile.io/getServer')
response.raise_for_status()
server_name = response.json()['data']['server']

with open(file_path, "rb") as fp:
upload_url = f'https://{server_name}.gofile.io/uploadFile'
response = sess.post(
upload_url,
data={'description': description},
files={ 'upload_file': fp },
f'https://{server_name}.gofile.io/uploadFile',
files={ 'file': fp },
stream=True,
)
response.raise_for_status()
return response.json()['data']['directLink']
return response.json()['data']['downloadPage']
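The uploader now follows GoFile's documented two-step flow: ask getServer for a host, then POST the file to that host's uploadFile endpoint under the 'file' field, returning the downloadPage link instead of directLink. A small usage sketch; the file path is illustrative, and the GoFile API may change over time:

from lncrawl.utils.uploader.gofile import upload

# Upload a generated ebook and print the gofile.io download page URL.
link = upload('Lightnovels/my-novel/my-novel.epub', 'generated by lncrawl')
print(link)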
2 changes: 1 addition & 1 deletion sources/_index.json

Large diffs are not rendered by default.

22 changes: 9 additions & 13 deletions sources/en/i/isotls.py
@@ -4,47 +4,43 @@

logger = logging.getLogger(__name__)


class IsotlsCrawler(Crawler):
base_url = 'https://isotls.com/'
base_url = [
'https://isotls.com/',
'https://www.isotls.com/',
]

def read_novel_info(self):
logger.debug('Visiting %s', self.novel_url)
soup = self.get_soup(self.novel_url)

possible_novel_cover = soup.select_one('meta[property="og:image"]')
if possible_novel_cover:
self.novel_cover = self.absolute_url(possible_novel_cover['content'])
logger.info('Novel cover: %s', self.novel_cover)
possible_cover = soup.select_one('meta[property="og:image"]')
if possible_cover:
self.novel_cover = self.absolute_url(possible_cover['content'])

possible_title = soup.select_one('meta[property="og:title"]')
assert possible_title, 'No novel title'
self.novel_title = possible_title['content']
logger.info('Novel title: %s', self.novel_title)

possible_novel_author = soup.select_one('meta[name="twitter:data1"]')
if possible_novel_author:
self.novel_author = possible_novel_author['content']
logger.info('%s', self.novel_author)

for a in soup.select('main section div:nth-child(2) ul li a'):
chap_id = len(self.chapters) + 1
vol_id = len(self.chapters) // 100 + 1
if len(self.chapters) % 100 == 0:
self.volumes.append({'id': vol_id})
# end if

self.chapters.append({
'id': chap_id,
'volume': vol_id,
'title': a.text.strip(),
'url': self.absolute_url(a['href']),
})
# end for
# end def

def download_chapter_body(self, chapter):
soup = self.get_soup(chapter['url'])
contents = soup.select('article p')
body = [str(p) for p in contents if p.text.strip()]
return '<p>' + '</p><p>'.join(body) + '</p>'
# end def
# end class
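This crawler now declares both the bare and the www. variant of the site in base_url. A sketch of how a list-valued base_url could be matched against a user-supplied URL; the helper below is illustrative and not lncrawl's actual dispatch code:

base_urls = [
    'https://isotls.com/',
    'https://www.isotls.com/',
]


def is_supported(novel_url: str) -> bool:
    # The URL is handled by this crawler if it starts with any declared mirror.
    return any(novel_url.startswith(base) for base in base_urls)


print(is_supported('https://www.isotls.com/novels/example'))  # True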
