diff --git a/analytics/.gitignore b/analytics/.gitignore deleted file mode 100644 index e934adfd1..000000000 --- a/analytics/.gitignore +++ /dev/null @@ -1 +0,0 @@ -cache/ diff --git a/analytics/cubinsizes.py b/analytics/cubinsizes.py deleted file mode 100755 index 33875057a..000000000 --- a/analytics/cubinsizes.py +++ /dev/null @@ -1,125 +0,0 @@ -#!/usr/bin/env python3 -# Tool for analyzing sizes of CUDA kernels for various GPU architectures -import os -import struct -import subprocess -import sys -from tempfile import TemporaryDirectory -from typing import Dict - - -# Try to auto-import elftools -try: - from elftools.elf.elffile import ELFFile -except ModuleNotFoundError: - print(f'elftools module not found, trying to install it from pip') - from pip._internal import main as pip_main - try: - pip_main(["install", "pyelftools", "--user"]) - except SystemExit: - print(f'PIP installation failed, please install it manually by invoking "{sys.executable} -mpip install pyelftools --user"') - sys.exit(-1) - from elftools.elf.elffile import ELFFile - - -# From https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size -def sizeof_fmt(num, suffix='B'): - for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: - if abs(num) < 1024.0: - return "%3.1f%s%s" % (num, unit, suffix) - num /= 1024.0 - return "%.1f%s%s" % (num, 'Yi', suffix) - - -def compute_cubin_sizes(file_name, section_name='.nv_fatbin', debug=False): - with open(file_name, 'rb') as f: - elf_file = ELFFile(f) - nv_fatbin = elf_file.get_section_by_name(section_name) - if nv_fatbin is None: - return {} - data = nv_fatbin.data() - idx, offs = 0, 0 - elf_sizes = {} - while offs < len(data): - (magic, version, header_size, fatbin_size) = struct.unpack('IHHL', data[offs: offs + 16]) - if magic != 0xba55ed50 or version != 1: - raise RuntimeError(f"Unexpected fatbin magic {hex(magic)} or version {version}") - if debug: - print(f"Found fatbin at {offs} header_size={header_size} fatbin_size={fatbin_size}") - offs += header_size - fatbin_end = offs + fatbin_size - while offs < fatbin_end: - (kind, version, hdr_size, elf_size, empty, code_ver, sm_ver) = struct.unpack('HHILLIH', data[offs: offs + 30]) - if version != 0x0101 or kind not in [1, 2]: - raise RuntimeError(f"Unexpected cubin version {hex(version)} or kind {kind}") - sm_ver = f'{"ptx" if kind == 1 else "sm"}_{sm_ver}' - if debug: - print(f" {idx}: elf_size={elf_size} code_ver={hex(code_ver)} sm={sm_ver}") - if sm_ver not in elf_sizes: - elf_sizes[sm_ver] = 0 - elf_sizes[sm_ver] += elf_size - idx, offs = idx + 1, offs + hdr_size + elf_size - offs = fatbin_end - return elf_sizes - - -class ArFileCtx: - def __init__(self, ar_name: str) -> None: - self.ar_name = os.path.abspath(ar_name) - self._tmpdir = TemporaryDirectory() - - def __enter__(self) -> str: - self._pwd = os.getcwd() - rc = self._tmpdir.__enter__() - subprocess.check_call(['ar', 'x', self.ar_name]) - return rc - - def __exit__(self, ex, value, tb) -> None: - os.chdir(self._pwd) - return self._tmpdir.__exit__(ex, value, tb) - - -def dict_add(rc: Dict[str, int], b: Dict[str, int]) -> Dict[str, int]: - for key, val in b.items(): - rc[key] = (rc[key] if key in rc else 0) + val - return rc - - -def main(): - if sys.platform != 'linux': - print('This script only works with Linux ELF files') - return - if len(sys.argv) < 2: - print(f"{sys.argv[0]} invoked without any arguments trying to infer location of libtorch_cuda") - import torch - fname = os.path.join(os.path.dirname(torch.__file__), 'lib', 'libtorch_cuda.so') - else: - fname = sys.argv[1] - - if not os.path.exists(fname): - print(f"Can't find {fname}") - sys.exit(-1) - - section_names = ['.nv_fatbin', '__nv_relfatbin'] - results = {name: {} for name in section_names} - print(f"Analyzing {fname}") - if os.path.splitext(fname)[1] == '.a': - with ArFileCtx(fname): - for fname in os.listdir("."): - if not fname.endswith(".o"): continue - for section_name in section_names: - elf_sizes = compute_cubin_sizes(fname, section_name) - dict_add(results[section_name], elf_sizes) - else: - for section_name in ['.nv_fatbin', '__nv_relfatbin']: - dict_add(results[section_name], compute_cubin_sizes(fname, section_name)) - - for section_name in section_names: - elf_sizes = results[section_name] - print(f"{section_name} size {sizeof_fmt(sum(elf_sizes.values()))}") - for (sm_ver, total_size) in elf_sizes.items(): - print(f" {sm_ver}: {sizeof_fmt(total_size)}") - - -if __name__ == '__main__': - main() diff --git a/analytics/download_count_wheels.py b/analytics/download_count_wheels.py deleted file mode 100644 index d3562fb16..000000000 --- a/analytics/download_count_wheels.py +++ /dev/null @@ -1,163 +0,0 @@ -from collections import defaultdict -from datetime import datetime, timedelta, timezone -import gzip -import os -import re -import urllib - -from tqdm import tqdm -import boto3 - -S3 = boto3.resource('s3') -CLIENT = boto3.client('s3') -BUCKET = S3.Bucket('pytorch') - -class CacheEntry: - _size = None - - def __init__(self, download_uri: str): - self.download_uri = download_uri - self.bytes_sent = 0 - - @property - def os_type(self) -> str: - os_type = "linux" - if "win" in self.download_uri: - os_type = "windows" - elif "macosx" in self.download_uri: - os_type = "macos" - return os_type - - @property - def target_arch(self) -> str: - target_arch = "cpu" - result = re.search(r"cu[0-9]+", self.download_uri) - if result: - target_arch = result[0] - return target_arch - - @property - def package_name(self) -> str: - filename_contents = os.path.basename(self.download_uri).split('-') - return filename_contents[0] - - @property - def package_version(self) -> str: - if "dev" in self.download_uri: - results = re.search( - r"[0-9]+\.[0-9]+\.[0-9]+\.dev[0-9]+", - self.download_uri - ) - else: - results = re.search( - r"[0-9]+\.[0-9]+\.[0-9]+", self.download_uri - ) - if not results: - raise Exception("Wtf there's no version o.O") - return results[0] - - @property - def size(self) -> int: - if self._size is None: - for key in BUCKET.objects.filter( - Prefix=self.download_uri.lstrip("/") - ): - self._size = key.size - if self._size is None: - raise Exception( - f"No object found for prefix {self.download_uri}" - ) - return self._size - - @property - def downloads(self): - return self.bytes_sent // self.size - -def parse_logs(log_directory: str) -> dict: - bytes_cache = {} - for (dirpath, _, filenames) in os.walk(log_directory): - for filename in tqdm(filenames): - with gzip.open(os.path.join(dirpath, filename), 'r') as gf: - string = gf.read().decode("utf-8") - entries = [] - entries += string.splitlines()[2:] - for entry in entries: - columns = entry.split('\t') - bytes_sent = int(columns[3]) - download_uri = urllib.parse.unquote( - urllib.parse.unquote(columns[7]) - ) - status = columns[8] - if not all([ - status.startswith("2"), - download_uri.endswith((".whl", ".zip")) - ]): - continue - if not bytes_cache.get(download_uri): - bytes_cache[download_uri] = CacheEntry(download_uri) - bytes_cache[download_uri].bytes_sent += bytes_sent - return bytes_cache - -def output_results(bytes_cache: dict) -> None: - os_results = defaultdict(int) - arch_results = defaultdict(int) - package_results = defaultdict(lambda: defaultdict(int)) - for _, val in tqdm(bytes_cache.items()): - try: - os_results[val.os_type] += val.downloads - arch_results[val.target_arch] += val.downloads - package_results[val.package_name][val.package_version] += ( - val.downloads - ) - except Exception: - pass - print("=-=-= Results =-=-=") - print("=-=-= OS =-=-=") - total_os_num = sum(os_results.values()) - for os_type, num in os_results.items(): - print( - f"\t* {os_type}: {num} ({(num/total_os_num)*100:.2f}%)" - ) - - print("=-=-= ARCH =-=-=") - total_arch_num = sum(arch_results.values()) - for arch_type, num in arch_results.items(): - print( - f"\t* {arch_type}: {num} ({(num/total_arch_num) * 100:.2f}%)" - ) - - print("=-=-= By Package =-=-=") - for package_name, upper_val in package_results.items(): - print(f"=-=-= {package_name} =-=-=") - total_package_num = sum(upper_val.values()) - for package_version, num in upper_val.items(): - print( - f"\t* {package_version}: {num} ({(num/total_package_num) * 100:.2f}%)" - ) - -def download_logs(log_directory: str, since: float): - dt_now = datetime.now(timezone.utc) - dt_end = datetime(dt_now.year, dt_now.month, dt_now.day, tzinfo=timezone.utc) - dt_start = dt_end - timedelta(days=1, hours=1) # Add 1 hour padding to account for potentially missed logs due to timing - for key in tqdm(BUCKET.objects.filter(Prefix='cflogs')): - remote_fname = key.key - local_fname = os.path.join(log_directory, remote_fname) - # Only download things from yesterday - dt_modified = key.last_modified.replace(tzinfo=timezone.utc) - if dt_start >= dt_modified or dt_end < dt_modified: - continue - # TODO: Do this in parallel - if not os.path.exists(local_fname): - dirname = os.path.dirname(local_fname) - if not os.path.exists(dirname): - os.makedirs(dirname) - CLIENT.download_file("pytorch", remote_fname, local_fname) - - -if __name__ == "__main__": - print("Downloading logs") - download_logs('cache', 1) - print("Parsing logs") - cache = parse_logs('cache/cflogs/') - print("Calculating results") - output_results(cache) diff --git a/analytics/duplicates_analyze.py b/analytics/duplicates_analyze.py deleted file mode 100755 index 8fdc3af22..000000000 --- a/analytics/duplicates_analyze.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python3 -from typing import Dict, List -from subprocess import check_output -import os -import sys - - -def get_defined_symbols(fname: str, verbose: bool = False) -> Dict[str, int]: - if verbose: - print(f"Processing {fname}...", end='', flush=True) - if sys.platform == 'darwin': - lines = check_output(['nm', '--defined-only', '-n', fname]).decode('ascii').split("\n")[:-1] - rc = {} - for idx, line in enumerate(lines): - addr, stype, name = line.split(' ') - size = 4 if idx + 1 == len(lines) else (int(lines[idx + 1].split(' ')[0], 16) - int(addr, 16)) - rc[name] = size - else: - lines = check_output(['nm', '--print-size', '--defined-only', fname]).decode('ascii').split('\n') - rc = {e[3]: int(e[1], 16) for e in [line.split() for line in lines] if len(e) == 4} - if verbose: - print("done") - return rc - - -def get_deps(fname: str) -> List[str]: - if sys.platform == 'darwin': - rc = [] - lines = check_output(['otool', '-l', fname]).decode('ascii').split("\n")[1:-1] - for idx, line in enumerate(lines): - if line.strip() != 'cmd LC_LOAD_DYLIB': - continue - path = lines[idx + 2].strip() - assert path.startswith('name') - rc.append(os.path.basename(path.split(' ')[1])) - return rc - lines = check_output(['readelf', '--dynamic', fname]).decode('ascii').split('\n') - return [line.split('[')[1][:-1] for line in lines if '(NEEDED)' in line] - - -def humansize(size): - if size < 1024: - return f"{size} bytes" - if size < 1024**2: - return f"{int(size/1024)} Kb" - if size < 1024**3: - return f"{size/(1024.0**2):.2f} Mb" - return f"{size/(1024.0**3):.2f} Gb" - - -def print_sizes(libname, depth: int = 2) -> None: - libs = [libname] - depth = 2 - symbols = {os.path.basename(libname): get_defined_symbols(libname, verbose=True)} - for _ in range(depth): - for lib in libs: - dirname = os.path.dirname(lib) - for dep in get_deps(lib): - path = os.path.join(dirname, dep) - if not os.path.exists(path): - continue - if path not in libs: - libs.append(path) - symbols[dep] = get_defined_symbols(path, verbose=True) - - for lib in libs: - lib_symbols = symbols[os.path.basename(lib)] - lib_keys = set(lib_symbols.keys()) - rc = f"{lib} symbols size {humansize(sum(lib_symbols.values()))}" - for dep in get_deps(lib): - if dep not in symbols: - continue - dep_overlap = lib_keys.intersection(set(symbols[dep].keys())) - overlap_size = sum(lib_symbols[k] for k in dep_overlap) - if overlap_size > 0: - rc += f" {dep} overlap is {humansize(overlap_size)}" - print(rc) - - -def print_symbols_overlap(libname1: str, libname2: str) -> None: - sym1 = get_defined_symbols(libname1, verbose=True) - sym2 = get_defined_symbols(libname2, verbose=True) - sym1_size = sum(sym1.values()) - sym2_size = sum(sym2.values()) - sym_overlap = set(sym1.keys()).intersection(set(sym2.keys())) - overlap_size = sum(sym1[s] for s in sym_overlap) - if overlap_size == 0: - print(f"{libname1} symbols size {humansize(sym1_size)} does not overlap with {libname2}") - return - print(f"{libname1} symbols size {humansize(sym1_size)} overlap {humansize(overlap_size)} ({100.0 * overlap_size/sym1_size :.2f}%)") - for sym in sym_overlap: - print(sym) - - -if __name__ == '__main__': - if len(sys.argv) == 3: - print_symbols_overlap(sys.argv[1], sys.argv[2]) - else: - print_sizes(sys.argv[1] if len(sys.argv) > 1 else "lib/libtorch_cuda.so") diff --git a/analytics/github_analyze.py b/analytics/github_analyze.py deleted file mode 100755 index c6da4d8ca..000000000 --- a/analytics/github_analyze.py +++ /dev/null @@ -1,525 +0,0 @@ -#!/usr/bin/env python3 - -from datetime import datetime, timedelta -from typing import Any, Dict, List, Iterable, Optional, Union -from urllib.request import urlopen, Request -from urllib.error import HTTPError -import json -import enum -import os - -class IssueState(enum.Enum): - OPEN = "open" - CLOSED = "closed" - ALL = "all" - - def __str__(self): - return self.value - - -class GitCommit: - commit_hash: str - title: str - body: str - author: str - author_date: datetime - commit_date: Optional[datetime] - pr_url: str - - def __init__(self, - commit_hash: str, - author: str, - author_date: datetime, - title: str, - body: str, - pr_url : str, - commit_date: Optional[datetime] = None) -> None: - self.commit_hash = commit_hash - self.author = author - self.author_date = author_date - self.commit_date = commit_date - self.title = title - self.body = body - self.pr_url = pr_url - - def __contains__(self, item: Any) -> bool: - return item in self.body or item in self.title - - def is_issue_mentioned(self, issue_url: str) -> bool: - if issue_url in self: - return True - if "/pull/" in issue_url: - return False - issue_hash = f"#{issue_url.split('issues/')[1]}" - if "fixes" in self.title.lower() and issue_hash in self.title: - return True - return any("fixes" in line.lower() and issue_hash in line for line in self.body.split("\n")) - - -def get_revert_revision(commit: GitCommit) -> Optional[str]: - import re - body_rc = re.search("Original Phabricator Diff: (D\\d+)", commit.body) - - if commit.title.startswith("Back out \"") and body_rc is not None: - return body_rc.group(1) - - rc = re.match("Revert (D\\d+):", commit.title) - if rc is None: - return None - return rc.group(1) - - -def get_diff_revision(commit: GitCommit) -> Optional[str]: - import re - rc = re.search("\\s*Differential Revision: (D\\d+)", commit.body) - if rc is None: - return None - return rc.group(1) - - -def get_ghf_revert_revision(commit: GitCommit) -> Optional[str]: - import re - rc = re.search("\\s*This reverts commit ([0-9a-f]+).", commit.body) - if all([ - commit.title.startswith("Revert"), - commit.author == "PyTorch MergeBot ", - rc is not None - ]): - return rc.group(1) - return None - - -def is_revert(commit: GitCommit) -> bool: - return get_revert_revision(commit) is not None or get_ghf_revert_revision(commit) is not None - - -def parse_medium_format(lines: Union[str, List[str]]) -> GitCommit: - """ - Expect commit message generated using `--format=medium --date=unix` format, i.e.: - commit - Author: - Date: - - - - <full commit message> - - """ - if isinstance(lines, str): - lines = lines.split("\n") - # TODO: Handle merge commits correctly - if len(lines) > 1 and lines[1].startswith("Merge:"): - del lines[1] - assert len(lines) > 5 - assert lines[0].startswith("commit") - assert lines[1].startswith("Author: ") - assert lines[2].startswith("Date: ") - assert len(lines[3]) == 0 - return GitCommit(commit_hash=lines[0].split()[1].strip(), - author=lines[1].split(":", 1)[1].strip(), - author_date=datetime.fromtimestamp(int(lines[2].split(":", 1)[1].strip())), - title=lines[4].strip(), - body="\n".join(lines[5:]), - ) - - -def parse_fuller_format(lines: Union[str, List[str]]) -> GitCommit: - """ - Expect commit message generated using `--format=fuller --date=unix` format, i.e.: - commit <sha1> - Author: <author> - AuthorDate: <author date> - Commit: <committer> - CommitDate: <committer date> - - <title line> - - <full commit message> - - """ - if isinstance(lines, str): - lines = lines.split("\n") - # TODO: Handle merge commits correctly - if len(lines) > 1 and lines[1].startswith("Merge:"): - del lines[1] - assert len(lines) > 7 - assert lines[0].startswith("commit") - assert lines[1].startswith("Author: ") - assert lines[2].startswith("AuthorDate: ") - assert lines[3].startswith("Commit: ") - assert lines[4].startswith("CommitDate: ") - assert len(lines[5]) == 0 - - prUrl = "" - for line in lines: - if "Pull Request resolved:" in line: - prUrl = line.split("Pull Request resolved:")[1].strip() - break - - return GitCommit(commit_hash=lines[0].split()[1].strip(), - author=lines[1].split(":", 1)[1].strip(), - author_date=datetime.fromtimestamp(int(lines[2].split(":", 1)[1].strip())), - commit_date=datetime.fromtimestamp(int(lines[4].split(":", 1)[1].strip())), - title=lines[6].strip(), - body="\n".join(lines[7:]), - pr_url=prUrl, - ) - - -def _check_output(items: List[str], encoding='utf-8') -> str: - from subprocess import check_output - return check_output(items).decode(encoding) - - -def get_git_remotes(path: str) -> Dict[str, str]: - keys = _check_output(["git", "-C", path, "remote"]).strip().split("\n") - return {key: _check_output(["git", "-C", path, "remote", "get-url", key]).strip() for key in keys} - - -class GitRepo: - def __init__(self, path, remote='upstream'): - self.repo_dir = path - self.remote = remote - - def _run_git_cmd(self, *args) -> str: - return _check_output(['git', '-C', self.repo_dir] + list(args)) - - def _run_git_log(self, revision_range) -> List[GitCommit]: - log = self._run_git_cmd('log', '--format=fuller', - '--date=unix', revision_range, '--', '.').split("\n") - rc: List[GitCommit] = [] - cur_msg: List[str] = [] - for line in log: - if line.startswith("commit"): - if len(cur_msg) > 0: - rc.append(parse_fuller_format(cur_msg)) - cur_msg = [] - cur_msg.append(line) - if len(cur_msg) > 0: - rc.append(parse_fuller_format(cur_msg)) - return rc - - def get_commit_list(self, from_ref, to_ref) -> List[GitCommit]: - return self._run_git_log(f"{self.remote}/{from_ref}..{self.remote}/{to_ref}") - - def get_ghstack_orig_branches(self) -> List[str]: - return [x.strip() for x in self._run_git_cmd("branch", "--remotes", "--list", self.remote + "/gh/*/orig").strip().split("\n")] - - def show_ref(self, ref) -> str: - return self._run_git_cmd("show-ref", ref).split(" ")[0] - - def merge_base(self, ref1, ref2) -> str: - return self._run_git_cmd("merge-base", ref1, ref2).strip() - - def rev_list(self, ref): - return self._run_git_cmd("rev-list", f"{self.remote}/main..{ref}").strip().split() - - -def build_commit_dict(commits: List[GitCommit]) -> Dict[str, GitCommit]: - rc = {} - for commit in commits: - assert commit.commit_hash not in rc - rc[commit.commit_hash] = commit - return rc - - -def fetch_json(url: str, params: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]: - headers = {'Accept': 'application/vnd.github.v3+json'} - token = os.environ.get("GITHUB_TOKEN") - if token is not None and url.startswith('https://api.github.com/'): - headers['Authorization'] = f'token {token}' - if params is not None and len(params) > 0: - url += '?' + '&'.join(f"{name}={val}" for name, val in params.items()) - try: - with urlopen(Request(url, headers=headers)) as data: - return json.load(data) - except HTTPError as err: - if err.code == 403 and all(key in err.headers for key in ['X-RateLimit-Limit', 'X-RateLimit-Used']): - print(f"Rate limit exceeded: {err.headers['X-RateLimit-Used']}/{err.headers['X-RateLimit-Limit']}") - raise - -def fetch_multipage_json(url: str, params: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]: - if params is None: - params = {} - assert "page" not in params - page_idx, rc, prev_len, params = 1, [], -1, params.copy() - while len(rc) > prev_len: - prev_len = len(rc) - params["page"] = page_idx - page_idx += 1 - rc += fetch_json(url, params) - return rc - - -def gh_get_milestones(org='pytorch', project='pytorch', state: IssueState = IssueState.OPEN) -> List[Dict[str, Any]]: - url = f'https://api.github.com/repos/{org}/{project}/milestones' - return fetch_multipage_json(url, {"state": state}) - -def gh_get_milestone_issues(org: str, project: str, milestone_idx: int, state: IssueState = IssueState.OPEN): - url = f'https://api.github.com/repos/{org}/{project}/issues' - return fetch_multipage_json(url, {"milestone": milestone_idx, "state": state}) - - -def gh_get_ref_statuses(org: str, project: str, ref: str) -> Dict[str, Any]: - url = f'https://api.github.com/repos/{org}/{project}/commits/{ref}/status' - params = {"page": 1, "per_page": 100} - nrc = rc = fetch_json(url, params) - while "statuses" in nrc and len(nrc["statuses"]) == 100: - params["page"] += 1 - nrc = fetch_json(url, params) - if "statuses" in nrc: - rc["statuses"] += nrc["statuses"] - return rc - -def get_issue_comments(org: str, project: str, issue_number : int): - url = f'https://api.github.com/repos/{org}/{project}/issues/{issue_number}/comments' - - return fetch_multipage_json(url) - -def extract_statuses_map(json: Dict[str, Any]): - return {s["context"]: s["state"] for s in json["statuses"]} - - -class PeriodStats: - commits: int - reverts: int - authors: int - date: datetime - - def __init__(self, date: datetime, commits: int, reverts: int, authors: int) -> None: - self.date = date - self.commits = commits - self.reverts = reverts - self.authors = authors - - -def get_monthly_stats(commits: List[GitCommit]) -> Iterable[PeriodStats]: - y, m, total, reverts, authors = None, None, 0, 0, set() - for commit in commits: - commit_date = commit.commit_date if commit.commit_date is not None else commit.author_date - if y != commit_date.year or m != commit_date.month: - if y is not None: - yield PeriodStats(datetime(y, m, 1), total, reverts, len(authors)) - y, m, total, reverts, authors = commit_date.year, commit_date.month, 0, 0, set() - if is_revert(commit): - reverts += 1 - total += 1 - authors.add(commit.author) - - -def print_monthly_stats(commits: List[GitCommit]) -> None: - stats = list(get_monthly_stats(commits)) - for idx, stat in enumerate(stats): - y = stat.date.year - m = stat.date.month - total, reverts, authors = stat.commits, stat.reverts, stat.authors - reverts_ratio = 100.0 * reverts / total - if idx + 1 < len(stats): - commits_growth = 100.0 * (stat.commits / stats[idx + 1].commits - 1) - else: - commits_growth = float('nan') - print(f"{y}-{m:02d}: commits {total} ({commits_growth:+.1f}%) reverts {reverts} ({reverts_ratio:.1f}%) authors {authors}") - - -def print_reverts(commits: List[GitCommit]) -> None: - for commit in commits: - if not is_revert(commit): - continue - print(f"{commit.commit_date} {commit.title} {commit.commit_hash} {commit.body}") - - -def analyze_reverts(commits: List[GitCommit]): - for idx, commit in enumerate(commits): - revert_id = get_revert_revision(commit) - if revert_id is None: - continue - orig_commit = None - for i in range(1, 100): - orig_commit = commits[idx + i] - if get_diff_revision(orig_commit) == revert_id: - break - if orig_commit is None: - print(f"Failed to find original commit for {commit.title}") - continue - print(f"{commit.commit_hash} is a revert of {orig_commit.commit_hash}: {orig_commit.title}") - revert_statuses = gh_get_ref_statuses("pytorch", "pytorch", commit.commit_hash) - orig_statuses = gh_get_ref_statuses("pytorch", "pytorch", orig_commit.commit_hash) - orig_sm = extract_statuses_map(orig_statuses) - revert_sm = extract_statuses_map(revert_statuses) - for k in revert_sm.keys(): - if k not in orig_sm: - continue - if orig_sm[k] != revert_sm[k]: - print(f"{k} {orig_sm[k]}->{revert_sm[k]}") - - -def print_contributor_stats(commits, delta: Optional[timedelta] = None) -> None: - authors: Dict[str, int] = {} - now = datetime.now() - # Default delta is one non-leap year - if delta is None: - delta = timedelta(days=365) - for commit in commits: - date, author = commit.commit_date, commit.author - if now - date > delta: - break - if author not in authors: - authors[author] = 0 - authors[author] += 1 - - print(f"{len(authors)} contributors made {sum(authors.values())} commits in last {delta.days} days") - for count, author in sorted(((commit, author) for author, commit in authors.items()), reverse=True): - print(f"{author}: {count}") - - -def commits_missing_in_branch(repo: GitRepo, branch: str, orig_branch: str, milestone_idx: int) -> None: - def get_commits_dict(x, y): - return build_commit_dict(repo.get_commit_list(x, y)) - main_commits = get_commits_dict(orig_branch, 'main') - release_commits = get_commits_dict(orig_branch, branch) - print(f"len(main_commits)={len(main_commits)}") - print(f"len(release_commits)={len(release_commits)}") - print("URL;Title;Status") - for issue in gh_get_milestone_issues('pytorch', 'pytorch', milestone_idx, IssueState.ALL): - issue_url, state = issue["html_url"], issue["state"] - # Skip closed states if they were landed before merge date - if state == "closed": - mentioned_after_cut = any(commit.is_issue_mentioned(issue_url) for commit in main_commits.values()) - # If issue is not mentioned after cut, that it must be present in release branch - if not mentioned_after_cut: - continue - mentioned_in_release = any(commit.is_issue_mentioned(issue_url) for commit in release_commits.values()) - # if Issue is mentioned is release branch, than it was picked already - if mentioned_in_release: - continue - print(f'{issue_url};{issue["title"]};{state}') - -def commits_missing_in_release(repo: GitRepo, branch: str, orig_branch: str, minor_release: str, milestone_idx: int, cut_off_date : datetime, issue_num : int) -> None: - def get_commits_dict(x, y): - return build_commit_dict(repo.get_commit_list(x, y)) - main_commits = get_commits_dict(minor_release, 'main') - prev_release_commits = get_commits_dict(orig_branch, branch) - current_issue_comments = get_issue_comments('pytorch', 'pytorch',issue_num) # issue comments for the release tracker as cherry picks - print(f"len(main_commits)={len(main_commits)}") - print(f"len(prev_release_commits)={len(prev_release_commits)}") - print(f"len(current_issue_comments)={len(current_issue_comments)}") - print(f"issue_num: {issue_num}, len(issue_comments)={len(current_issue_comments)}") - print("URL;Title;Status") - - # Iterate over the previous release branch to find potentially missing cherry picks in the current issue. - for commit in prev_release_commits.values(): - not_cherry_picked_in_current_issue = any(commit.pr_url not in issue_comment['body'] for issue_comment in current_issue_comments) - for main_commit in main_commits.values(): - if main_commit.pr_url == commit.pr_url : - mentioned_after_cut_off_date = cut_off_date < main_commit.commit_date - if not_cherry_picked_in_current_issue and mentioned_after_cut_off_date: - # Commits that are release only, which exist in previous release branch and not in main. - print(f'{commit.pr_url};{commit.title};{commit.commit_date}') - break - -def analyze_stacks(repo: GitRepo) -> None: - from tqdm.contrib.concurrent import thread_map - branches = repo.get_ghstack_orig_branches() - stacks_by_author: Dict[str, List[int]] = {} - for branch,rv_commits in thread_map(lambda x: (x, repo.rev_list(x)), branches, max_workers=10): - author = branch.split("/")[2] - if author not in stacks_by_author: - stacks_by_author[author]=[] - stacks_by_author[author].append(len(rv_commits)) - for author, slen in sorted(stacks_by_author.items(), key=lambda x:len(x[1]), reverse=True): - if len(slen) == 1: - print(f"{author} has 1 stack of depth {slen[0]}") - continue - print(f"{author} has {len(slen)} stacks max depth is {max(slen)} avg depth is {sum(slen)/len(slen):.2f} mean is {slen[len(slen)//2]}") - - -def parse_arguments(): - from argparse import ArgumentParser - parser = ArgumentParser(description="Print GitHub repo stats") - parser.add_argument("--repo-path", - type=str, - help="Path to PyTorch git checkout", - default=os.path.expanduser("~/git/pytorch/pytorch")) - parser.add_argument("--milestone-id", type=str) - parser.add_argument("--branch", type=str) - parser.add_argument("--minor-release", type=str) - parser.add_argument("--remote", - type=str, - help="Remote to base off of", - default="") - parser.add_argument("--analyze-reverts", action="store_true") - parser.add_argument("--print-reverts", action="store_true") - parser.add_argument("--contributor-stats", action="store_true") - parser.add_argument("--missing-in-branch", action="store_true") - parser.add_argument("--missing-in-release", action="store_true") - parser.add_argument("--analyze-stacks", action="store_true") - parser.add_argument('--date', type=lambda d: datetime.strptime(d, '%Y-%m-%d')) - parser.add_argument("--issue-num", type=int) - return parser.parse_args() - - -def main(): - import time - args = parse_arguments() - remote = args.remote - if not remote: - remotes = get_git_remotes(args.repo_path) - # Pick best remote - remote = next(iter(remotes.keys())) - for key in remotes: - if remotes[key].endswith('github.com/pytorch/pytorch'): - remote = key - - repo = GitRepo(args.repo_path, remote) - - if args.analyze_stacks: - analyze_stacks(repo) - return - - # Use milestone idx or search it along milestone titles - try: - milestone_idx = int(args.milestone_id) - except ValueError: - milestone_idx = -1 - milestones = gh_get_milestones() - for milestone in milestones: - if milestone.get('title', '') == args.milestone_id: - milestone_idx = int(milestone.get('number', '-2')) - if milestone_idx < 0: - print(f'Could not find milestone {args.milestone_id}') - return - - if args.missing_in_branch: - commits_missing_in_branch(repo, - args.branch, - f'orig/{args.branch}', - milestone_idx) - return - - if args.missing_in_release: - commits_missing_in_release(repo, - args.branch, - f'orig/{args.branch}', - args.minor_release, - milestone_idx, - args.date, - args.issue_num - ) - return - - print(f"Parsing git history with remote {remote}...", end='', flush=True) - start_time = time.time() - x = repo._run_git_log(f"{remote}/main") - print(f"done in {time.time()-start_time:.1f} sec") - if args.analyze_reverts: - analyze_reverts(x) - elif args.contributor_stats: - print_contributor_stats(x) - elif args.print_reverts: - print_reverts(x[:2**9]) - else: - print_monthly_stats(x) - - -if __name__ == "__main__": - main() diff --git a/analytics/s3_test_stats_analyze.py b/analytics/s3_test_stats_analyze.py deleted file mode 100644 index 74b4f6de8..000000000 --- a/analytics/s3_test_stats_analyze.py +++ /dev/null @@ -1,147 +0,0 @@ -import argparse -import boto3 -import bz2 -import json -import os -import re -import requests - -import pandas as pd - -from datetime import datetime, timedelta -from tqdm import tqdm -from typing import Any, Dict, Optional, List - -S3 = boto3.resource('s3') -CLIENT = boto3.client('s3') -BUCKET = S3.Bucket('ossci-metrics') - -GITHUB_API_BASE = "https://api.github.com/" -GITHUB_COMMITS_API = "repos/pytorch/pytorch/commits" -STRF_FORMAT = "%Y-%m-%dT%H:%M:%SZ" - -CACHE_PICKLE = "cache/test_time/dataframe.pickle" - -def _get_latests_git_commit_sha_list(lookback: int): - sha_since = (datetime.utcnow() - timedelta(hours = lookback)).strftime(STRF_FORMAT) - resp = requests.get(GITHUB_API_BASE + GITHUB_COMMITS_API + f"?since={sha_since}") - if resp.status_code == 200: - return [e.get('sha') for e in resp.json()] - else: - return [] - -def _json_to_df(data: Dict[str, Any], granularity: str) -> pd.DataFrame: - reformed_data = list() - for fname, fdata in data['files'].items(): - if granularity == 'file': - reformed_data.append({ - "job": data['job'], - "sha": data['sha'], - 'file': fname, - 'file_total_sec': fdata['total_seconds'], - }) - else: - for sname, sdata in fdata['suites'].items(): - if granularity == 'suite': - reformed_data.append({ - "job": data['job'], - "sha": data['sha'], - 'suite': sname, - 'suite_total_sec': sdata['total_seconds'], - }) - else: - for cname, cdata in sdata['cases'].items(): - reformed_data.append({ - "job": data['job'], - "sha": data['sha'], - 'case': cname, - 'case_status': cdata['status'], - 'case_sec': cdata['seconds'], - }) - df = pd.json_normalize(reformed_data) - return df - - -def download_stats(folder: str, lookback: int): - commit_sha_list = _get_latests_git_commit_sha_list(lookback) - for commit_sha in commit_sha_list: - for key in tqdm(BUCKET.objects.filter(Prefix=f'test_time/{commit_sha}')): - remote_fname = key.key - local_fname = os.path.join(folder, remote_fname) - # TODO: Do this in parallel - if not os.path.exists(local_fname): - dirname = os.path.dirname(local_fname) - if not os.path.exists(dirname): - os.makedirs(dirname) - # only download when there's a cache miss - if not os.path.exists(local_fname) or not os.path.isfile(local_fname): - print(f"\nDownloading {remote_fname}...") - CLIENT.download_file("ossci-metrics", remote_fname, local_fname) - - -def parse_and_export_stats(folder: str, granularity: str, commit_sha_lists: Optional[List[str]] = None): - dataframe = None - for (dirpath, _, filenames) in os.walk(folder): - for filename in tqdm(filenames): - splits = dirpath.split("/") - job_name = splits[-1] - sha = splits[-2] - if not commit_sha_lists or sha in commit_sha_lists: - with bz2.open(os.path.join(dirpath, filename), 'r') as zf: - string = zf.read().decode("utf-8") - data = json.loads(string) - # create a deep json with sha and job info - data['sha'] = sha - data['job'] = job_name - df = _json_to_df(data, granularity) - dataframe = df if dataframe is None else dataframe.append(df) - return dataframe - - -def main(): - parser = argparse.ArgumentParser( - __file__, - description="download and cache test stats locally, both raw and pandas format", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument( - '--lookback', - type=int, - help='lookback in # of hours', - default=24, - ) - parser.add_argument( - '--output', - help='output filename', - default='cache/df.pickle', - ) - parser.add_argument( - '--cache_folder', - help='cache folder', - default='cache', - ) - parser.add_argument( - '--granularity', - choices=['file', 'suite', 'case'], - help='granularity of stats summary', - default='file', - ) - args = parser.parse_args() - - lookback = args.lookback - cache_folder = args.cache_folder - output = args.output - granularity = args.granularity - - print("Downloading test stats") - download_stats(cache_folder, lookback) - print("Parsing test stats and write to pd dataframe") - if not os.path.exists(output): - dataframe = parse_and_export_stats(f'{cache_folder}/test_time/', granularity) - dataframe.to_pickle(output) - - - -if __name__ == "__main__": - main() - diff --git a/analytics/validate_binaries.py b/analytics/validate_binaries.py deleted file mode 100644 index 65965c59a..000000000 --- a/analytics/validate_binaries.py +++ /dev/null @@ -1,77 +0,0 @@ -from conda.cli.python_api import Commands, run_command -from tabulate import tabulate -from datetime import datetime -import json - -PLATFORMS = ["osx-64", "linux-64", "win-64"] -PYTHON_VERSIONS = ["3.10", "3.9", "3.8", "3.7"] -CUDA_CUDNN_VERSION = [ - ("11.7", "8.5.0"), ("cpu", None) -] -CHANNEL = "pytorch-test" -VERSION = "1.13.*" - - -def generate_expected_builds(platform: str) -> set: - builds = set() - for py_version in PYTHON_VERSIONS: - if platform == "osx-64": - # macos builds support cpu only. - builds.add(f"py{py_version}_0") - continue - - for cuda_version, cudnn_version in CUDA_CUDNN_VERSION: - if platform == "win-64": - cudnn_version = "8" - - if cuda_version == "cpu": - builds.add(f"py{py_version}_{cuda_version}_0") - else: - builds.add(f"py{py_version}_cuda{cuda_version}_cudnn{cudnn_version}_0") - return builds - - -def size_format(size_num) -> str: - for unit in ["", "K", "M", "G"]: - if abs(size_num) < 1024.0: - return f"{size_num:3.1f}{unit}B" - - size_num /= 1024.0 - return f"{size_num:3.1f}TB" - - -def main() -> None: - # Iterate over platform to gather build information of available conda version. - for platform in PLATFORMS: - expected_builds = generate_expected_builds(platform) - - # Actual builds available in Conda - stdout, stderr, return_code = run_command( - Commands.SEARCH, f"{CHANNEL}::*[name=pytorch version={VERSION} subdir={platform}]", "--json") - - if return_code != 0: - raise Exception(stderr) - - available_versions = json.loads(stdout) - output_data = [] - headers = ["File Name", "Date", "Size"] - actual_builds = set() - for version in available_versions["pytorch"]: - actual_builds.add(version["build"]) - output_data.append(( - version["fn"], - datetime.fromtimestamp(version["timestamp"] / 1000), - size_format(version["size"]) - )) - - assert len(expected_builds) > 0, "expected builds set should not be empty." - assert expected_builds == actual_builds, ( - f"Missing following builds in conda: {expected_builds.difference(actual_builds)} for platform {platform}" - ) - - print(f"\nSuccessfully verified following binaries are available in Conda for {platform}...") - print(tabulate(output_data, headers=headers, tablefmt="grid")) - - -if __name__ == "__main__": - main() diff --git a/analytics/validate_pypi_staging.py b/analytics/validate_pypi_staging.py deleted file mode 100644 index 5321313df..000000000 --- a/analytics/validate_pypi_staging.py +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env python3 - -import os.path -import shutil -import subprocess -import tempfile -import zipfile - -import boto3 -import botocore - -PLATFORMS = [ - "manylinux1_x86_64", - "manylinux2014_aarch64", - "win_amd64", - "macosx_11_0_arm64", -] -PYTHON_VERSIONS = [ - "cp38", - "cp39", - "cp310", - "cp311", - "cp312" - ] -S3_PYPI_STAGING = "pytorch-backup" -PACKAGE_RELEASES = { - "torch": "2.3.1", - "torchvision": "0.18.1", - "torchaudio": "2.3.1", - "torchtext": "0.18.0", - "executorch": "0.2.1" -} - -PATTERN_V = "Version:" -PATTERN_RD = "Requires-Dist:" - -s3 = boto3.client("s3") - - -def get_size(path): - size = os.path.getsize(path) - if size < 1024: - return f"{size} bytes" - elif size < pow(1024, 2): - return f"{round(size/1024, 2)} KB" - elif size < pow(1024, 3): - return f"{round(size/(pow(1024,2)), 2)} MB" - elif size < pow(1024, 4): - return f"{round(size/(pow(1024,3)), 2)} GB" - - -def generate_expected_builds(platform: str, package: str, release: str) -> list: - builds = [] - for py_version in PYTHON_VERSIONS: - py_spec = f"{py_version}-{py_version}" - platform_spec = platform - - if package == "torchtext" and ( - platform == "manylinux2014_aarch64" or py_version == "cp312" - ): - continue - - # strange macos file nameing - if "macos" in platform: - if package == "torch": - py_spec = f"{py_version}-none" - elif "macosx_10_9_x86_64" in platform: - platform_spec = "macosx_10_13_x86_64" - - builds.append( - f"{package}-{release}-pypi-staging/{package}-{release}-{py_spec}-{platform_spec}.whl" - ) - - return builds - - -def validate_file_metadata(build: str, package: str, version: str): - temp_dir = tempfile.mkdtemp() - tmp_file = f"{temp_dir}/{os.path.basename(build)}" - s3.download_file(Bucket=S3_PYPI_STAGING, Key=build, Filename=tmp_file) - print(f"Downloaded: {tmp_file} {get_size(tmp_file)}") - - try: - check_wheels = subprocess.run( - ["check-wheel-contents", tmp_file, "--ignore", "W002,W009,W004"], - capture_output=True, - text=True, - check=True, - encoding="utf-8", - ) - print(check_wheels.stdout) - print(check_wheels.stderr) - except subprocess.CalledProcessError as e: - exit_code = e.returncode - stderror = e.stderr - print(exit_code, stderror) - - with zipfile.ZipFile(f"{temp_dir}/{os.path.basename(build)}", "r") as zip_ref: - zip_ref.extractall(f"{temp_dir}") - - for i, line in enumerate( - open(f"{temp_dir}/{package}-{version}.dist-info/METADATA") - ): - if line.startswith(PATTERN_V): - print(f"{line}", end="") - exttracted_version = line.removeprefix(PATTERN_V).strip() - if version != exttracted_version: - print( - f"FAILURE VERSION DOES NOT MATCH expected {version} got {exttracted_version}" - ) - - elif line.startswith(PATTERN_RD): - print(f"{line}", end="") - - shutil.rmtree(temp_dir) - - -def main(): - expected_builds = dict.fromkeys(PACKAGE_RELEASES, []) - - # Iterate over platform to gather build information of available conda version. - for package in PACKAGE_RELEASES: - for platform in PLATFORMS: - expected_builds[package] = expected_builds[ - package - ] + generate_expected_builds(platform, package, PACKAGE_RELEASES[package]) - - for package in PACKAGE_RELEASES: - count = 0 - for build in expected_builds[package]: - try: - s3.head_object(Bucket=S3_PYPI_STAGING, Key=build) - print(f"Validating filename {os.path.basename(build)}") - validate_file_metadata(build, package, PACKAGE_RELEASES[package]) - count += 1 - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "404": - print(f"FAILED 404 Error on {build}") - elif e.response["Error"]["Code"] == "403": - print(f"FAILED Unauthorized Error on {build}") - else: - print(f"Error on {build}") - print(f"Package Validated {count} for {package}") - - -if __name__ == "__main__": - main() diff --git a/common/install_conda.sh b/common/install_conda.sh deleted file mode 100644 index 6ae978f05..000000000 --- a/common/install_conda.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -ex - -# Anaconda -# Latest anaconda is using openssl-3 which is incompatible with all currently published versions of git -# Which are using openssl-1.1.1, see https://anaconda.org/anaconda/git/files?version=2.40.1 for example -MINICONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-py311_23.5.2-0-Linux-x86_64.sh -wget -q $MINICONDA_URL -# NB: Manually invoke bash per https://github.com/conda/conda/issues/10431 -bash $(basename "$MINICONDA_URL") -b -p /opt/conda -rm $(basename "$MINICONDA_URL") -export PATH=/opt/conda/bin:$PATH -# See https://github.com/pytorch/builder/issues/1473 -# Pin conda to 23.5.2 as it's the last one compatible with openssl-1.1.1 -conda install -y conda=23.5.2 conda-build anaconda-client git ninja -# The cmake version here needs to match with the minimum version of cmake -# supported by PyTorch (3.18). There is only 3.18.2 on anaconda -/opt/conda/bin/pip3 install cmake==3.18.2 -conda remove -y --force patchelf diff --git a/common/install_patchelf.sh b/common/install_patchelf.sh deleted file mode 100644 index 37b69415e..000000000 --- a/common/install_patchelf.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -set -ex - -# Pin the version to latest release 0.17.2, building newer commit starts -# to fail on the current image -git clone -b 0.17.2 --single-branch https://github.com/NixOS/patchelf -cd patchelf -sed -i 's/serial/parallel/g' configure.ac -./bootstrap.sh -./configure -make -make install -cd .. -rm -rf patchelf