From 14e64984c55f37e902c0dc52bcaf36a13a33bc5b Mon Sep 17 00:00:00 2001
From: Karma Riuk
Date: Fri, 16 May 2025 19:39:59 +0200
Subject: [PATCH] Add cached entries to the dataset before any processing,
 instead of while iterating the repos, so all previously saved data is kept

---
 pull_requests.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pull_requests.py b/pull_requests.py
index 2664602..ee5dde1 100644
--- a/pull_requests.py
+++ b/pull_requests.py
@@ -382,9 +382,7 @@ def process_repo(
     repo = g.get_repo(repo_name)
     already_seen_prs = set()
     if repo.full_name in cache:
-        dataset.entries.extend(cache[repo.full_name].values())
         already_seen_prs = set(cache[repo.full_name].keys())
-        dataset.to_json(args.output)
 
     prs = repo.get_pulls(state="closed")
 
@@ -422,6 +420,10 @@ def process_repos(
            Passing it by reference in order have the latest information, in case of an error
        verbose (bool): Whether to be verbose or not
    """
+    for pr2entry in tqdm(cache.values(), desc="Adding cache in dataset"):
+        dataset.entries.extend(pr2entry.values())
+    dataset.to_json(args.output)
+
    with tqdm(total=len(df), desc="Processing repos") as pbar:
        for _, row in df.iterrows():
            repo_name = row["name"]