Skip to content

Commit 90c2ed5

Browse files
J03D03 and claude committed
Replace blobless clone with size-limited partial clone
Rename CloneStrategy.BLOBLESS to PARTIAL and switch the git filter from blob:none to blob:limit=$BLOB_SIZE_LIMIT (default 1m). Small blobs come down with the initial clone, so enrichment avoids a round-trip per file while still skipping oversized blobs. The limit is configurable via the BLOB_SIZE_LIMIT env var.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent af859ab commit 90c2ed5

4 files changed

Lines changed: 19 additions & 16 deletions

File tree

‎tests/enrichment/test_batch_processing.py‎

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66

77

88
class TestPickCloneStrategy:
9-
def test_below_threshold_is_blobless(self):
9+
def test_below_threshold_is_partial(self):
1010
commits = {"https://github.com/a/a": {"c1", "c2", "c3"}}
1111
assert _pick_clone_strategy(commits, threshold=10) == {
12-
"https://github.com/a/a": CloneStrategy.BLOBLESS
12+
"https://github.com/a/a": CloneStrategy.PARTIAL
1313
}
1414

1515
def test_at_threshold_is_full(self):
@@ -24,7 +24,7 @@ def test_mixed_repos(self):
2424
"https://github.com/big/repo": {f"c{i}" for i in range(50)},
2525
}
2626
result = _pick_clone_strategy(commits, threshold=10)
27-
assert result["https://github.com/small/repo"] is CloneStrategy.BLOBLESS
27+
assert result["https://github.com/small/repo"] is CloneStrategy.PARTIAL
2828
assert result["https://github.com/big/repo"] is CloneStrategy.FULL
2929

3030
def test_default_threshold_matches_config(self):
@@ -34,7 +34,7 @@ def test_default_threshold_matches_config(self):
3434
}
3535
result = _pick_clone_strategy(commits)
3636
assert result["https://github.com/a/a"] is CloneStrategy.FULL
37-
assert result["https://github.com/b/b"] is CloneStrategy.BLOBLESS
37+
assert result["https://github.com/b/b"] is CloneStrategy.PARTIAL
3838

3939
def test_empty_input(self):
4040
assert _pick_clone_strategy({}) == {}

‎vfc_datasets/config.py‎

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@ def _env_path(name: str, default: Path) -> Path:
4747
# Clone strategy: repos with >= this many commits to enrich get a full clone
4848
FULL_CLONE_THRESHOLD = _env_int("FULL_CLONE_THRESHOLD", 100, minimum=1)
4949

50+
# Partial clone size filter. Git accepts suffixes (k/m/g). Blobs at or above
51+
# this size are omitted during clone and fetched lazily on access.
52+
BLOB_SIZE_LIMIT = os.getenv("BLOB_SIZE_LIMIT", "1m")
53+
5054
# Dataset Caching
5155
USE_DATASET_CACHE = os.getenv("USE_DATASET_CACHE", "true").lower() in ("true", "yes")
5256

‎vfc_datasets/transformations/enrichment/batch_processing.py‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ def _pick_clone_strategy(
2222
commits_by_url: dict[str, set[str]],
2323
threshold: int = FULL_CLONE_THRESHOLD,
2424
) -> dict[str, CloneStrategy]:
25-
"""Pick FULL for repos with >= threshold commits to enrich, else BLOBLESS."""
25+
"""Pick FULL for repos with >= threshold commits to enrich, else PARTIAL."""
2626
return {
27-
url: (CloneStrategy.FULL if len(cids) >= threshold else CloneStrategy.BLOBLESS)
27+
url: (CloneStrategy.FULL if len(cids) >= threshold else CloneStrategy.PARTIAL)
2828
for url, cids in commits_by_url.items()
2929
}
3030

‎vfc_datasets/utils/git/repository.py‎

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import enum
22
import errno
33
import logging
4-
import multiprocessing
54
import shutil
65
from collections.abc import Iterable
76
from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -11,7 +10,7 @@
1110
from git import GitCommandError, InvalidGitRepositoryError, Repo
1211
from tqdm.auto import tqdm
1312

14-
from vfc_datasets.config import GIT_CLONE_TIMEOUT, MAX_CLONE_WORKERS
13+
from vfc_datasets.config import BLOB_SIZE_LIMIT, GIT_CLONE_TIMEOUT, MAX_CLONE_WORKERS
1514
from vfc_datasets.dataset_entry import DatasetEntry
1615

1716
from .url import url_to_pathname
@@ -22,11 +21,11 @@
2221
class CloneStrategy(enum.Enum):
2322
"""How much of a repository to fetch.
2423
25-
BLOBLESS: partial clone, on-demand blob fetch. Cheap for few commits/repo.
24+
PARTIAL: partial clone filtered by BLOB_SIZE_LIMIT; large blobs fetched on demand.
2625
FULL: all objects local. Cheaper once commits/repo exceeds FULL_CLONE_THRESHOLD.
2726
"""
2827

29-
BLOBLESS = "blobless"
28+
PARTIAL = "partial"
3029
FULL = "full"
3130

3231

@@ -188,8 +187,8 @@ def _clone_new_repo(
188187
"-c",
189188
"http.lowSpeedTime=600", # 10 min before timeout
190189
]
191-
if strategy is CloneStrategy.BLOBLESS:
192-
clone_options.extend(["--filter", "blob:none"])
190+
if strategy is CloneStrategy.PARTIAL:
191+
clone_options.extend(["--filter", f"blob:limit={BLOB_SIZE_LIMIT}"])
193192

194193
try:
195194
repo = Repo.clone_from(
@@ -225,7 +224,7 @@ def clone_repository(
225224
git_url: str,
226225
branch: str | None = None,
227226
timeout: int | None = None,
228-
strategy: CloneStrategy = CloneStrategy.BLOBLESS,
227+
strategy: CloneStrategy = CloneStrategy.PARTIAL,
229228
) -> Repo | None:
230229
"""Clone or reuse a git repository with a simplified interface."""
231230
git_url = (git_url or "").strip()
@@ -251,13 +250,13 @@ def clone_repositories(
251250
max_workers: int | None = None,
252251
branch: str | None = None,
253252
timeout: int | None = None,
254-
strategy: CloneStrategy | dict[str, CloneStrategy] = CloneStrategy.BLOBLESS,
253+
strategy: CloneStrategy | dict[str, CloneStrategy] = CloneStrategy.PARTIAL,
255254
) -> dict[str, Repo | None]:
256255
"""Clone all repositories from dataset entries using parallel processing.
257256
258257
``strategy`` may be a single CloneStrategy applied to every URL, or a dict
259258
mapping project_url -> CloneStrategy. URLs missing from the dict fall back
260-
to BLOBLESS.
259+
to PARTIAL.
261260
"""
262261
project_urls = {entry.project_url for entry in entries if entry.project_url}
263262

@@ -273,7 +272,7 @@ def clone_repositories(
273272

274273
def _strategy_for(url: str) -> CloneStrategy:
275274
if isinstance(strategy, dict):
276-
return strategy.get(url, CloneStrategy.BLOBLESS)
275+
return strategy.get(url, CloneStrategy.PARTIAL)
277276
return strategy
278277

279278
results = {}

0 commit comments

Comments
 (0)