Skip to content

Commit eee5bf6

Browse files
committed
Skip large diffs and limit clone concurrency
1 parent b2e5681 commit eee5bf6

4 files changed

Lines changed: 25 additions & 14 deletions

File tree

‎vfc_datasets/config.py‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def _env_path(name: str, default: Path) -> Path:
4242

4343
# Git Operations
4444
GIT_CLONE_TIMEOUT = _env_int("GIT_CLONE_TIMEOUT", 3600, minimum=1) # 1 hour default
45+
MAX_CLONE_WORKERS = _env_int("MAX_CLONE_WORKERS", 4, minimum=1)
4546

4647
# Clone strategy: repos with >= this many commits to enrich get a full clone
4748
FULL_CLONE_THRESHOLD = _env_int("FULL_CLONE_THRESHOLD", 100, minimum=1)

‎vfc_datasets/transformations/enrichment/add_commit_data_local.py‎

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,15 @@ def _get_commit_info(repo: Repo, commit_id: str, max_diff_size: int) -> CommitDa
2121
"""Get commit info from an open Repo object."""
2222
try:
2323
commit = repo.commit(commit_id)
24-
diff_text = get_commit_diff(repo, commit_id)
2524

26-
# Check diff size limit
27-
diff_data: str | None = diff_text
28-
if len(diff_text) > max_diff_size:
29-
diff_data = None
25+
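# NOTE: max_diff_size is a character limit. The changed-line count is a cheap lower bound on the diff length, so exceeding the limit here means the full diff would exceed it too; the exact character check follows below.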
26+
if commit.stats.total["lines"] > max_diff_size:
27+
return None
28+
29+
diff_data: str | None = None
30+
diff_text = get_commit_diff(repo, commit_id)
31+
if len(diff_text) <= max_diff_size:
32+
diff_data = diff_text
3033

3134
return CommitData(
3235
message=str(commit.message),

‎vfc_datasets/transformations/enrichment/add_no_comment.py‎

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -232,13 +232,14 @@ def _process_diff_item(diff_item: Any, repo: Repo, include_unsupported: bool) ->
232232

233233

234234
def _get_diff_no_comments(
235-
repo: Repo, commit_id: str, include_unsupported: bool = True
235+
repo: Repo, commit_id: str, max_diff_size: int, include_unsupported: bool = True
236236
) -> str | None:
237237
"""Generate diff with comments stripped.
238238
239239
Args:
240240
repo: Git repository object
241241
commit_id: Commit SHA to process
242+
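max_diff_size: Size limit in characters. Commits whose changed-line count exceeds this value are skipped (None is returned) before the patch is generated.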
242243
include_unsupported: If True, include original diff for unsupported files.
243244
If False, skip unsupported files entirely.
244245
"""
@@ -247,6 +248,10 @@ def _get_diff_no_comments(
247248
if not commit.parents:
248249
return None
249250

251+
# NOTE: max_diff_size is a character limit; the changed-line count is compared against it as a cheap pre-filter so the patch is never built for very large commits.
252+
if commit.stats.total["lines"] > max_diff_size:
253+
return None
254+
250255
diffs = commit.parents[0].diff(commit, create_patch=True)
251256
file_diffs: list[str] = []
252257

@@ -261,15 +266,17 @@ def _get_diff_no_comments(
261266
return None
262267

263268

264-
def _process_batch(args: tuple[str, list[str], bool]) -> dict[str, str]:
269+
def _process_batch(args: tuple[str, list[str], int, bool]) -> dict[str, str]:
265270
"""Process a batch of commits."""
266-
repo_path, commit_ids, include_unsupported = args
271+
repo_path, commit_ids, max_diff_size, include_unsupported = args
267272
results: dict[str, str] = {}
268273

269274
try:
270275
with Repo(repo_path) as repository:
271276
for commit_id in commit_ids:
272-
diff = _get_diff_no_comments(repository, commit_id, include_unsupported)
277+
diff = _get_diff_no_comments(
278+
repository, commit_id, max_diff_size, include_unsupported
279+
)
273280
if diff is not None:
274281
results[commit_id] = diff
275282
except Exception as e:
@@ -313,6 +320,6 @@ def strip_diff_comments(
313320
),
314321
batch_fn=_process_batch,
315322
apply_fn=_apply_diff,
316-
batch_extra_args=(include_unsupported,),
323+
batch_extra_args=(MAX_DIFF_SIZE, include_unsupported),
317324
desc="Stripping comments",
318325
)

‎vfc_datasets/utils/git/repository.py‎

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@
44
import multiprocessing
55
import shutil
66
from collections.abc import Iterable
7-
from concurrent.futures import ProcessPoolExecutor, as_completed
7+
from concurrent.futures import ThreadPoolExecutor, as_completed
88
from pathlib import Path
99
from urllib.parse import urlparse
1010

1111
from git import GitCommandError, InvalidGitRepositoryError, Repo
1212
from tqdm.auto import tqdm
1313

14-
from vfc_datasets.config import GIT_CLONE_TIMEOUT
14+
from vfc_datasets.config import GIT_CLONE_TIMEOUT, MAX_CLONE_WORKERS
1515
from vfc_datasets.dataset_entry import DatasetEntry
1616

1717
from .url import url_to_pathname
@@ -269,7 +269,7 @@ def clone_repositories(
269269
timeout = GIT_CLONE_TIMEOUT
270270

271271
if max_workers is None:
272-
max_workers = min(multiprocessing.cpu_count(), 16, len(project_urls))
272+
max_workers = min(MAX_CLONE_WORKERS, len(project_urls))
273273

274274
def _strategy_for(url: str) -> CloneStrategy:
275275
if isinstance(strategy, dict):
@@ -279,7 +279,7 @@ def _strategy_for(url: str) -> CloneStrategy:
279279
results = {}
280280
failed_urls = []
281281

282-
with ProcessPoolExecutor(max_workers=max_workers) as executor:
282+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
283283
future_to_url = {
284284
executor.submit(
285285
clone_repository,

0 commit comments

Comments
 (0)