-
Notifications
You must be signed in to change notification settings - Fork 3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
execute batch downloads in parallel worker threads #12923
base: main
Are you sure you want to change the base?
Conversation
9d7ca27
to
1f25bf5
Compare
1f25bf5
to
af99bae
Compare
Progress output now looks really nice!
f42fb29
to
65c461f
Compare
For example:
# delete wheel cache and http download cache
> rm -rf ~/.cache/pip/{http{,-v2},wheels}/
> time PYTHONPATH="$(readlink -f src)" python -m pip install --progress-bar on --batch-download-parallelism 10 --dry-run --ignore-installed --report test.json --use-feature=fast-deps 'numpy>=1.19.5' 'keras==2.11.0' 'mtcnn' 'pillow>=7.0.0' 'bleach>=2.1.0' 'tensorflow-gpu==2.11.0'
# ...
╭─────────────────────────────────────────── Download Progress ────────────────────────────────────────────╮
│ total downloads - ━━━━━━━╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20% 9/46 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭────────────────────────────────────── Individual Request Progress ───────────────────────────────────────╮
│total bytes ╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2% 13.4/734.1 MB 5.5 MB/s eta 0:02:12│
│tensorflow_gpu-2.11.... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 1.8/588.3 MB 805.1 kB/s eta 0:12:09│
│numpy-2.1.0-cp310-cp... ━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11% 1.8/16.3 MB 820.8 kB/s eta 0:00:18│
│mtcnn-0.1.1-py3-none... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━━━━━ 81% 1.8/2.3 MB 869.6 kB/s eta 0:00:01│
│pillow-10.4.0-cp310-... ━━━━━━━━━━━╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 29% 1.3/4.5 MB 563.7 kB/s eta 0:00:06│
│grpcio-1.65.5-cp310-... ━━━━━╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14% 0.8/5.7 MB 411.9 kB/s eta 0:00:12│
│h5py-3.11.0-cp310-cp... ━━━━━╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 15% 0.8/5.3 MB 411.0 kB/s eta 0:00:11│
│libclang-18.1.1-py2.... ━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3% 0.8/24.5 MB 460.1 kB/s eta 0:00:52│
│opencv_python-4.10.0... ╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1% 0.8/62.5 MB 431.8 kB/s eta 0:02:24│
│protobuf-3.19.6-cp31... ━━━━━━━━━╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 25% 0.3/1.1 MB ? eta -:--:--│
│tensorboard-2.11.2-p... ━━━━━━╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17% 1.0/6.0 MB 1.0 MB/s eta 0:00:05│
│setuptools-73.0.0-py... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/2.3 MB ? eta -:--:--│
│tensorflow_estimator... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/439.2 kB ? eta -:--:--│
│tensorflow_io_gcs_fi... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/5.1 MB ? eta -:--:--│
│termcolor-2.4.0-py3-... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/7.7 kB ? eta -:--:--│
│typing_extensions-4.... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/37.4 kB ? eta -:--:--│
│wrapt-1.16.0-cp310-c... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/80.3 kB ? eta -:--:--│
│packaging-24.1-py3-n... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/54.0 kB ? eta -:--:--│
│webencodings-0.5.1-p... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/11.8 kB ? eta -:--:--│
│google_auth-2.34.0-p... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/200.9 kB ? eta -:--:--│
│google_auth_oauthlib... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/18.3 kB ? eta -:--:--│
│Markdown-3.7-py3-non... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/106.3 kB ? eta -:--:--│
│requests-2.32.3-py3-... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/64.9 kB ? eta -:--:--│
│tensorboard_data_ser... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/4.9 MB ? eta -:--:--│
│tensorboard_plugin_w... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/781.3 kB ? eta -:--:--│
│werkzeug-3.0.3-py3-n... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/227.3 kB ? eta -:--:--│
│wheel-0.44.0-py3-non... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/67.1 kB ? eta -:--:--│
│cachetools-5.5.0-py3... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/9.5 kB ? eta -:--:--│
│certifi-2024.7.4-py3... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/163.0 kB ? eta -:--:--│
│charset_normalizer-3... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/142.1 kB ? eta -:--:--│
│idna-3.7-py3-none-an... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/66.8 kB ? eta -:--:--│
│MarkupSafe-2.1.5-cp3... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/25.8 kB ? eta -:--:--│
│pyasn1_modules-0.4.0... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/181.2 kB ? eta -:--:--│
│requests_oauthlib-2.... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/24.2 kB ? eta -:--:--│
│rsa-4.9-py3-none-any... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/34.3 kB ? eta -:--:--│
│urllib3-2.2.2-py3-no... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/121.4 kB ? eta -:--:--│
│oauthlib-3.2.2-py3-n... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/151.7 kB ? eta -:--:--│
│pyasn1-0.6.0-py2.py3... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0.0/85.3 kB ? eta -:--:--│
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
e5d75de
to
0cb3256
Compare
It turns out that batched progress output actually looks super neat! But it also ended up being a lot of extra code that we can probably decouple from the download parallelism (see "Part A" and "Part B" in the PR description). I'm going to try to separate these two components into different PRs to make them easier to review.
0cb3256
to
4687d11
Compare
5cc1d0f
to
481e1c4
Compare
Ok, I split out #12925 for the pooled progress output so we can review the progress reporting (which is about presentation of terminal output) separately from the parallelism (which is about performance and tricky communication/semaphore mechanisms).
- use more specific types for BatchDownloader#__call__
- calculate byte lengths with a HEAD request
- quiet all progress output from -q
- don't write colored output with --no-color
- write a lot more documentation for the new progress bar logic
- use ProgressBarType enum for --progress-bar CLI flag
- limit downloads to 10 at a time instead of starting all at once
- add cli arg to limit download parallelism
- factor out receiving thread exceptions into a contextmanager
- default batch parallelism to 10
- make batch download parallelism 1 in html index test
- explicitly yield threads to help ensure correct download ordering
481e1c4
to
6eeb26f
Compare
This is probably ready to review as well, but I'm trying to minimize the number of PRs I have open at once, so I'm keeping it drafted until #12925 is in.
Why not use concurrent.futures.ThreadPoolExecutor?
if error_flag.is_set():
    return
try:
    resp = _http_get_download(session, link)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Might be worth getting the raw socket from the request so the main thread can close it on a KeyboardInterrupt
This PR is based off of #12925. See the +215/-46 diff against it at cosmicexplorer/pip@batched-progress-bar...batch-download.

Problem
See #825. With the metadata resolve work from `fast-deps`, PEP 658 support, and more (see #12921), we have developed the ability to download wheels in a batch at once after the metadata-only resolve has completed.

Solution
- […] `threading.Thread`.
- […] `threading.Semaphore` to reuse our `PipSession` connection pool.
- […] `--batch-download-parallelism` option has been added to `download`, `install`, and `wheel` to limit parallelism as needed.
- […] `queue.Queue` and `yield` into an iterator to prepare requirements in parallel with downloads.
- […] `KeyboardInterrupt`) with a `threading.Event` and gracefully shut down worker threads.

Result
- […] `tensorflow-gpu`) before starting our other downloads!