Instead of making two passes through the PRs, we just do one and process each PR if it's good
Karma Riuk
2025-03-26 13:42:41 +01:00
parent 9a48413ead
commit b02f08452d

@@ -35,34 +35,6 @@ def is_pull_good(pull: PullRequest, verbose: bool = False):
     )
-def get_good_prs(
-    repo: Repository, cache: dict[str, dict[int, DatasetEntry]] = {}
-) -> list[PullRequest]:
-    good_prs = []
-    potenially_good_prs = repo.get_pulls(state="closed")
-    number_of_prs = potenially_good_prs.totalCount
-    if number_of_prs == 0:
-        return []
-    with tqdm(
-        total=number_of_prs,
-        desc=f"Extracting good PRs from {repo.full_name}",
-        leave=False,
-    ) as pbar:
-        for pr in potenially_good_prs:
-            pbar.set_postfix({"new good found": len(good_prs), "pr_number": pr.number})
-            if pr.merged_at is None or pr.number in cache.get(repo.full_name, set()):
-                pbar.update(1)
-                continue
-            if is_pull_good(pr):
-                good_prs.append(pr)
-            pbar.update(1)
-    return good_prs
 def run_git_cmd(cmd: list[str], repo_path: str) -> subprocess.CompletedProcess:
     return subprocess.run(
         ["git", "-C", repo_path] + cmd,
@@ -238,12 +210,17 @@ def process_repo(
         dataset.entries.extend(cache[repo.full_name].values())
         dataset.to_json(args.output)
-    good_prs = []
-    good_prs = get_good_prs(repo, cache)
-    with tqdm(good_prs, desc="Processing good prs", leave=False) as pbar:
+    prs = repo.get_pulls(state="closed")
+    n_good_prs = 0
+    with tqdm(prs, desc="Processing prs", leave=False) as pbar:
         for pr in pbar:
-            pbar.set_postfix({"pr": pr.number})
-            process_pull(repo, pr, dataset, repos_dir, cache)
+            pbar.set_postfix({"pr": pr.number, "# new good found": n_good_prs})
+            if pr.merged_at is None:
+                pbar.update(1)
+                continue
+            if is_pull_good(pr):
+                n_good_prs += 1
+                process_pull(repo, pr, dataset, repos_dir, cache)
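
Distilled from the hunk above, the shape this commit lands on: list the closed PRs once, skip anything unmerged, and filter and process in the same pass. A minimal runnable sketch assuming PyGithub and tqdm; the GITHUB_TOKEN auth, the repo name, and the is_pull_good / process_pull bodies below are placeholders standing in for the project's real helpers:

import os

from github import Github  # PyGithub
from tqdm import tqdm

def is_pull_good(pr) -> bool:
    # Stand-in filter: the real project applies its own criteria here.
    return pr.get_review_comments().totalCount > 0

def process_pull(pr) -> None:
    # Stand-in for the real processing step (dataset extraction, etc.).
    print(f"processing PR #{pr.number}")

def process_repo_single_pass(repo) -> None:
    prs = repo.get_pulls(state="closed")  # one paginated listing, iterated once

    n_good_prs = 0
    with tqdm(prs, desc="Processing prs", leave=False) as pbar:
        for pr in pbar:
            pbar.set_postfix({"pr": pr.number, "# new good found": n_good_prs})
            if pr.merged_at is None:  # closed but never merged: skip
                continue
            if is_pull_good(pr):  # filter and process in the same pass
                n_good_prs += 1
                process_pull(pr)

if __name__ == "__main__":
    gh = Github(os.environ["GITHUB_TOKEN"])  # hypothetical token source
    process_repo_single_pass(gh.get_repo("octocat/Hello-World"))  # placeholder repo

One detail worth noting: when tqdm wraps the iterable itself, the bar advances once per loop iteration on its own, so the skip path needs no explicit pbar.update(1); the extra call in the committed hunk makes the displayed count run one ahead for every skipped PR.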