Added a small .unique() call on repo names to avoid processing a repo twice.
This commit is contained in:
Karma Riuk
2025-06-05 10:45:01 +02:00
parent bf1591c61d
commit 4c5e486ad6

View File

@@ -554,7 +554,7 @@ def process_repos_parallel(
dataset.to_json(args.output) dataset.to_json(args.output)
print("Done") print("Done")
repo_names = [repo_name for repo_name in df["name"] if repo_name not in EXCLUSION_LIST] repo_names = [repo_name for repo_name in df["name"].unique() if repo_name not in EXCLUSION_LIST]
free_positions = list(range(1, n_workers + 1)) free_positions = list(range(1, n_workers + 1))
repo_names_iter = iter(repo_names) repo_names_iter = iter(repo_names)
future_to_repo: dict[Future, tuple[str, int]] = {} future_to_repo: dict[Future, tuple[str, int]] = {}
@@ -642,9 +642,9 @@ def process_repos(
dataset.entries.extend(pr2entry.values()) dataset.entries.extend(pr2entry.values())
dataset.to_json(args.output) dataset.to_json(args.output)
with tqdm(total=len(df), desc="Processing repos", unit="repo") as pbar: repo_names = df["name"].unique()
for _, row in df.iterrows(): with tqdm(total=len(repo_names), desc="Processing repos", unit="repo") as pbar:
repo_name = row["name"] for _, repo_name in df.iterrows():
assert isinstance(repo_name, str) assert isinstance(repo_name, str)
if repo_name in EXCLUSION_LIST: if repo_name in EXCLUSION_LIST:
pbar.update(1) pbar.update(1)