-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
55555bd
commit f4cdec0
Showing
10 changed files
with
571,972 additions
and
285,950 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
.env | ||
follow_users.log | ||
/scraper_usernames/.env | ||
fething_new_users.ipynb |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import os | ||
import requests | ||
import dotenv | ||
|
||
|
||
dotenv.load_dotenv() | ||
github_token = os.getenv("GITHUB_TOKEN")# your github token | ||
|
||
|
||
def fething_users_from_github(users_to_fetch=1, token=None) -> list:
    """Return the logins of GitHub users matching a fixed Python-developer search.

    Queries the GitHub user-search endpoint for accounts with Python
    repositories, more than 5 repositories and more than 10 followers.

    Args:
        users_to_fetch: Number of results requested via ``per_page``
            (GitHub documents a maximum of 100 per page).
        token: Optional GitHub access token. A bare token is sent as
            ``Bearer <token>``; a value that already carries a ``Bearer``
            or ``token`` scheme is passed through unchanged. When falsy,
            the request is sent unauthenticated.

    Returns:
        A list of login names. The list is empty when the request fails;
        the error is printed rather than raised.
    """
    search_query = 'language:python repos:>5 followers:>10'
    url = "https://api.github.com/search/users"
    # `since` was dropped: it is a parameter of the "list users" endpoint,
    # not of the search endpoint used here.
    params = {
        'per_page': users_to_fetch,
        'q': search_query,
    }

    headers = {}
    if token:
        # GitHub expects "<scheme> <token>"; sending the bare token as the
        # whole header value is rejected as bad credentials.
        has_scheme = token.lower().startswith(('bearer ', 'token '))
        headers['Authorization'] = token if has_scheme else f'Bearer {token}'

    try:
        # A timeout keeps a stalled connection from blocking the caller forever.
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        users = response.json().get('items', [])
    except requests.exceptions.RequestException as e:
        # HTTPError is a RequestException subclass, so one handler covers both.
        print(f"Error: {e}")
        return []

    return [user['login'] for user in users]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-10-16T23:58:18.707985Z", | ||
"start_time": "2024-10-16T23:58:18.703958Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": [ | ||
"import os\n", | ||
"import requests\n", | ||
"\n", | ||
"github_token = \"#\"## your github token" | ||
], | ||
"id": "b91c72b336bd44c8", | ||
"outputs": [], | ||
"execution_count": 48 | ||
}, | ||
{ | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-10-17T01:40:59.610303Z", | ||
"start_time": "2024-10-17T01:40:59.604896Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": [ | ||
"def fething_users_from_github(users_to_fetch = 1, token = None) -> list:\n", | ||
" scraped_users = []\n", | ||
" \n", | ||
" querry = 'language:python repos:>5 followers:>10'\n", | ||
" url = \"https://api.github.com/search/users\"\n", | ||
" params = {\n", | ||
" 'per_page' : users_to_fetch,\n", | ||
" 'since' : 0,\n", | ||
" 'q' : querry\n", | ||
" \n", | ||
" }\n", | ||
" headers = {\n", | ||
" 'Authorization' : token\n", | ||
" }\n", | ||
" \n", | ||
" try:\n", | ||
" response = requests.get(url, params=params, headers=headers)\n", | ||
" response.raise_for_status()\n", | ||
" users = response.json().get('items', [])\n", | ||
" \n", | ||
" for user in users:\n", | ||
" scraped_users.append(user['login'])\n", | ||
"\n", | ||
" except requests.exceptions.HTTPError as e:\n", | ||
" print(f\"Error: {e}\")\n", | ||
" except requests.exceptions.RequestException as e:\n", | ||
" print(f\"Error: {e}\")\n", | ||
"\n", | ||
" return scraped_users" | ||
], | ||
"id": "a4f14605767e6f53", | ||
"outputs": [], | ||
"execution_count": 77 | ||
}, | ||
{ | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-10-17T01:41:01.602561Z", | ||
"start_time": "2024-10-17T01:41:00.955263Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": [ | ||
"users = fething_users_from_github(50)\n", | ||
"print(users)\n", | ||
"print(len(users))" | ||
], | ||
"id": "e781948fa8616f68", | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"['karpathy', 'openai', 'rafaballerini', 'google', 'geohot', 'huggingface', 'michaelliao', 'llSourcell', 'taylorotwell', '3b1b', 'ry', 'krishnaik06', 'kennethreitz', 'buckyroberts', 'tiangolo', 'facebookresearch', 'rasbt', 'jwasham', 'gvanrossum', 'python', 'techwithtim', 'mitsuhiko', 'MorvanZhou', 'donnemartin', 'elyxdev', 'Visualize-ML', 'BEPb', 'jakevdp', 'liyupi', 'fchollet', 'tensorflow', 'iam-veeramalla', 'chiphuyen', 'wesm', 'ageron', 'lllyasviel', 'goodfeli', 'fengdu78', 'breakwa11', 'angusshire', 'miguelgrinberg', 'leerob', 'aws-samples', 'Stability-AI', 'JohnHammond', 'GoogleCloudPlatform', 'jrohitofficial', 'amueller', 'htr-tech', 'mnielsen']\n", | ||
"50\n" | ||
] | ||
} | ||
], | ||
"execution_count": 78 | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 2 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython2", | ||
"version": "2.7.6" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,128 +1,39 @@ | ||
import asyncio | ||
import aiohttp | ||
import time | ||
import os | ||
import logging | ||
from typing import List | ||
from dotenv import load_dotenv | ||
|
||
# Load environment variables from .env file | ||
load_dotenv() | ||
|
||
# Configure logging | ||
logging.basicConfig( | ||
level=logging.INFO, # Set to INFO for general logs; use DEBUG for more verbosity | ||
format='%(asctime)s [%(levelname)s] %(message)s', | ||
handlers=[ | ||
logging.FileHandler("follow_users.log"), | ||
logging.StreamHandler() | ||
] | ||
) | ||
|
||
# Constants | ||
USERNAMES_FILE = os.getenv('USERNAMES_FILE', 'usernames.txt') | ||
LAST_LINE_FILE = os.getenv('LAST_LINE_FILE', 'last_line.txt') | ||
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN') | ||
|
||
if not GITHUB_TOKEN: | ||
logging.error("GitHub token not found. Please set GITHUB_TOKEN in the environment variables.") | ||
exit(1) | ||
from datetime import time | ||
|
||
# Semaphore to limit concurrent requests (set to 1 for sequential processing) | ||
SEM = asyncio.Semaphore(1) | ||
|
||
# Function to read usernames from a file | ||
def read_usernames(file_path: str) -> List[str]:
    """Load the non-blank lines of *file_path* as a list of usernames.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped.

    Args:
        file_path: Path of the text file holding one username per line.

    Returns:
        The usernames in file order.

    Raises:
        SystemExit: With exit code 1 when the file is missing or cannot be
            read; the failure is logged first.
    """
    try:
        # Explicit encoding so the result does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            stripped_lines = (line.strip() for line in file)
            usernames = [name for name in stripped_lines if name]
    except FileNotFoundError:
        logging.error(f"Usernames file '{file_path}' not found.")
        # Raise SystemExit directly instead of relying on the site-provided exit().
        raise SystemExit(1)
    except Exception as e:
        logging.exception(f"An error occurred while reading '{file_path}': {e}")
        raise SystemExit(1)

    logging.info(f"Loaded {len(usernames)} usernames from '{file_path}'.")
    return usernames
|
||
# Function to read the last processed line number | ||
def read_last_line(file_path: str) -> int:
    """Return the line number stored in *file_path*, or 0 when unavailable.

    The file is expected to hold a single integer. A missing file,
    non-integer content, or any read error yields 0, and the reason is
    logged.
    """
    # Nothing saved yet: start at the top.
    if not os.path.exists(file_path):
        logging.info("No last line file found. Starting from the beginning.")
        return 0

    try:
        with open(file_path, 'r') as handle:
            resume_index = int(handle.read().strip())
    except ValueError:
        logging.warning(f"Invalid content in '{file_path}'. Starting from the beginning.")
        return 0
    except Exception as err:
        logging.exception(f"An error occurred while reading '{file_path}': {err}")
        return 0

    logging.info(f"Resuming from line {resume_index + 1}.")
    return resume_index
|
||
# Function to write the last processed line number | ||
def write_last_line(file_path: str, line_number: int) -> None:
    """Overwrite *file_path* with the decimal text of *line_number*.

    Any failure while writing is logged with its traceback and otherwise
    ignored; nothing is raised to the caller.
    """
    text = str(line_number)
    try:
        with open(file_path, 'w') as handle:
            handle.write(text)
    except Exception as err:
        logging.exception(f"An error occurred while writing to '{file_path}': {err}")
    else:
        logging.debug(f"Updated last line to {line_number} in '{file_path}'.")
import requests | ||
from fetching_new_users import fething_users_from_github | ||
import logging | ||
import dotenv | ||
|
||
# Asynchronous function to follow a user on GitHub | ||
async def follow_user(session: aiohttp.ClientSession, username: str, line_number: int) -> None: | ||
url = f'https://api.github.com/user/following/{username}' | ||
async with SEM: # Ensure sequential processing | ||
try: | ||
async with session.put(url) as response: | ||
status = response.status | ||
text = await response.text() | ||
|
||
if status == 204: | ||
logging.info(f"Line {line_number + 1}: Successfully followed '{username}'.") | ||
elif status == 404: | ||
logging.warning(f"Line {line_number + 1}: User '{username}' not found.") | ||
elif status == 403 or status == 429: | ||
logging.error(f"Line {line_number + 1}: Rate limit exceeded or forbidden access.") | ||
else: | ||
logging.error(f"Line {line_number + 1}: Failed to follow '{username}': {status}, {text}") | ||
dotenv.load_dotenv() | ||
USERNAMES_FILE = 'usernames.txt' | ||
LAST_LINE_FILE = 'last_line.txt' | ||
github_token = os.getenv("GITHUB_TOKEN")# your github token | ||
|
||
except Exception as e: | ||
logging.exception(f"Line {line_number + 1}: Error following user '{username}': {e}") | ||
### fetch 100 users from github | ||
|
||
# Main asynchronous function | ||
async def main(): | ||
usernames = read_usernames(USERNAMES_FILE) | ||
last_line = read_last_line(LAST_LINE_FILE) | ||
total_usernames = len(usernames) | ||
logging.info(f"Starting to follow users from line {last_line + 1} to {total_usernames}.") | ||
users = fething_users_from_github(100, github_token) | ||
|
||
headers = { | ||
'Authorization': f'token {GITHUB_TOKEN}', | ||
'Accept': 'application/vnd.github.v3+json', | ||
'User-Agent': 'GitHub-Follow-Script' # Replace with your application's name | ||
} | ||
### write the users to a file | ||
def write_users_to_file(users):
    """Overwrite USERNAMES_FILE with one entry of *users* per line."""
    with open(USERNAMES_FILE, 'w') as out:
        out.writelines(f"{login}\n" for login in users)
|
||
async with aiohttp.ClientSession(headers=headers) as session: | ||
for i, username in enumerate(usernames[last_line:], start=last_line): | ||
await follow_user(session, username, i) | ||
### read the users from the file | ||
|
||
# Wait for 10 seconds before processing the next user | ||
if i < total_usernames - 1: | ||
#logging.info("Waiting for 10 seconds before following the next user...") | ||
await asyncio.sleep(10) | ||
### follow the users | ||
|
||
# Update the last processed line | ||
write_last_line(LAST_LINE_FILE, i + 1) | ||
### mark the last user followed | ||
|
||
logging.info("Finished processing all usernames.") | ||
### repeat the process - main loop | ||
|
||
if __name__ == "__main__": | ||
try: | ||
asyncio.run(main()) | ||
except KeyboardInterrupt: | ||
logging.info("Script interrupted by user.") | ||
except Exception as e: | ||
logging.exception(f"An unexpected error occurred: {e}") | ||
def main():
    """Fetch users and save them to the usernames file every 10 minutes, forever.

    Each cycle requests up to 100 logins with the configured GitHub token,
    writes them to the usernames file, logs the count, then sleeps for 600
    seconds. The loop has no exit condition.
    """
    # Imported locally because this file also does `from datetime import time`,
    # which rebinds the module-level name `time` to a class without sleep().
    from time import sleep

    while True:
        users = fething_users_from_github(100, github_token)
        write_users_to_file(users)
        logging.info(f"Following {len(users)} users.")
        logging.info("Waiting for 10 minutes...")
        sleep(600)
Oops, something went wrong.