From 14e64984c55f37e902c0dc52bcaf36a13a33bc5b Mon Sep 17 00:00:00 2001
From: Karma Riuk
Date: Fri, 16 May 2025 19:39:59 +0200
Subject: [PATCH] Add cached entries to the dataset before any processing,
 instead of while iterating the repos, so all previously saved data is kept

---
 pull_requests.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pull_requests.py b/pull_requests.py
index 2664602..ee5dde1 100644
--- a/pull_requests.py
+++ b/pull_requests.py
@@ -382,9 +382,7 @@ def process_repo(
     repo = g.get_repo(repo_name)
     already_seen_prs = set()
     if repo.full_name in cache:
-        dataset.entries.extend(cache[repo.full_name].values())
         already_seen_prs = set(cache[repo.full_name].keys())
-        dataset.to_json(args.output)
 
     prs = repo.get_pulls(state="closed")
 
@@ -422,6 +420,10 @@ def process_repos(
            Passing it by reference in order have the latest information, in case of an error
        verbose (bool): Whether to be verbose or not
    """
+    for pr2entry in tqdm(cache.values(), desc="Adding cache in dataset"):
+        dataset.entries.extend(pr2entry.values())
+    dataset.to_json(args.output)
+
    with tqdm(total=len(df), desc="Processing repos") as pbar:
        for _, row in df.iterrows():
            repo_name = row["name"]