now using the PyGithub library, so that it handles for me the issue of too many requests in too little time
Karma Riuk
2025-03-06 16:36:53 +01:00
parent 57b0f0c2cd
commit 9fa7dd53af
2 changed files with 38 additions and 54 deletions
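A note on what the message refers to, with a minimal sketch (not part of the commit): PyGithub wraps the REST endpoints in lazily paginated objects and exposes the remaining request quota, which is what makes the "too many requests" problem tractable. The token variable name is taken from the diff below; the rest is illustrative.

import os
from github import Github

g = Github(os.environ["GITHUB_AUTH_TOKEN_CRAB"])

# Inspect how much of the hourly REST quota is left and when it resets.
limit = g.get_rate_limit()
print(limit.core.remaining, "requests left; resets at", limit.core.reset)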

@@ -3,3 +3,4 @@ tqdm
 docker
 beautifulsoup4
 unidiff
+PyGithub
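For reference (my addition, not stated in the commit): PyGithub is the package name on PyPI, so the new requirement installs with pip directly.

pip install PyGithub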

@@ -1,35 +1,22 @@
-import os, requests
+import os
 from datetime import datetime
 import pandas as pd
 import tqdm
+from github import Github

-COMMON_HEADERS = {
-    'Accept': 'application/vnd.github+json',
-    'Authorization': f'Bearer {os.environ["GITHUB_AUTH_TOKEN_CRAB"]}',
-    'X-Github-Api-Version': '2022-11-28',
-}
+# Initialize GitHub API client
+g = Github(os.environ["GITHUB_AUTH_TOKEN_CRAB"])

-def github_call(url, params = {}):
-    result = requests.get(url, headers=COMMON_HEADERS, params=params)
-    if result.status_code != 200:
-        raise Exception(f"Failed to fetch {url}: {result.status_code}, {result = }")
-    return result
-
-def get_pulls(repo_url: str) -> list[dict]:
-    response = github_call(f'{repo_url}/pulls', params={"state": "all"})
-    return response.json()
-
-def has_only_1_round_of_comments(commits: list[dict], comments: list[dict]) -> bool:
-    if len(comments) == 0 or len(commits) == 0:
+def has_only_1_round_of_comments(commits, comments):
+    if not comments or not commits:
         return False

-    # Convert timestamps to datetime objects for easy comparison
-    commit_dates = [datetime.fromisoformat(c["commit"]["author"]["date"]) for c in commits]
-    comment_dates = [datetime.fromisoformat(c["created_at"]) for c in comments]
+    commit_dates = [c.commit.author.date for c in commits]
+    comment_dates = [c.created_at for c in comments]

     commit_dates.sort()
     comment_dates.sort()

     # Identify the first and last comment times
     first_comment_time = comment_dates[0]
     last_comment_time = comment_dates[-1]
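Worth spelling out about the hunk above (my observation, not the author's): the datetime.fromisoformat calls disappear because PyGithub deserializes timestamps into datetime objects on its own. A hedged illustration, using an arbitrary public repository and pull request number:

from github import Github

g = Github()  # anonymous client; low quota, but enough for a one-off check
pr = g.get_repo("octocat/Hello-World").get_pull(1)
first = pr.get_commits()[0]
print(type(first.commit.author.date))  # <class 'datetime.datetime'>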
@@ -39,47 +26,43 @@ def has_only_1_round_of_comments(commits: list[dict], comments: list[dict]) -> bool:
     return True

-def process_pull(repo_name: str, pull_number: str) -> dict:
-    pull = github_call(f"https://api.github.com/repos/{repo_name}/pulls/{pull_number}").json()
-    commits = github_call(f"https://api.github.com/repos/{repo_name}/pulls/{pull_number}/commits").json()
-    comments = github_call(f"https://api.github.com/repos/{repo_name}/pulls/{pull_number}/comments").json()
+def process_pull(repo, pull):
+    commits = list(pull.get_commits())
+    comments = list(pull.get_review_comments())

     return {
-        "repo": repo_name,
-        "pr_number": pull["number"],
-        "additions": pull["additions"],
-        "deletions": pull["deletions"],
-        "changed_files": pull["changed_files"],
+        "repo": repo.full_name,
+        "pr_number": pull.number,
+        "additions": pull.additions,
+        "deletions": pull.deletions,
+        "changed_files": pull.changed_files,
         "has_only_1_round_of_comments": has_only_1_round_of_comments(commits, comments),
         "has_only_1_comment": len(comments) == 1,
     }

-def process_repo(repo_name: str) -> list[dict]:
+def process_repo(repo_name):
+    repo = g.get_repo(repo_name)
     stats = []
-    pulls = get_pulls(f"https://api.github.com/repos/{repo_name}")
-    for pull in tqdm.tqdm(pulls, desc=repo_name, leave=False):
-        if "merged_at" not in pull or pull["merged_at"] is None:
+    for pull in tqdm.tqdm(list(repo.get_pulls(state="closed")), desc=repo_name, leave=False):
+        if not pull.merged_at:
             continue
-        stats.append(process_pull(repo_name, pull["number"]))
+        stats.append(process_pull(repo, pull))
     return stats

 def main():
     repos = pd.read_csv("results.csv")
     repos = repos[repos["good_repo_for_crab"] == True]
     print(len(repos))
     stats = []
-    for _, row in tqdm.tqdm(repos.iterrows(), total=len(repos)):
-        if "name" not in row or not isinstance(row["name"], str):
-            continue
-        name = row["name"]
-        stats.extend(process_repo(name))
-    pd.DataFrame(stats).to_csv("pr_stats.csv", index=False)
+    try:
+        for _, row in tqdm.tqdm(repos.iterrows(), total=len(repos)):
+            if "name" not in row or not isinstance(row["name"], str):
+                continue
+            stats.extend(process_repo(row["name"]))
+    finally:
+        pd.DataFrame(stats).to_csv("pr_stats.csv", index=False)

 if __name__ == "__main__":
     main()
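Closing the loop on the commit message: by default PyGithub raises RateLimitExceededException once the quota is exhausted rather than waiting, so heavy scans may still want a retry policy. A hedged sketch, assuming a recent PyGithub; all parameter values here are illustrative:

import os
import urllib3
from github import Github

g = Github(
    os.environ["GITHUB_AUTH_TOKEN_CRAB"],
    per_page=100,  # bigger pages mean fewer pagination requests
    retry=urllib3.util.Retry(
        total=5,
        backoff_factor=2,
        status_forcelist=[403, 429],  # statuses GitHub uses when throttling
        respect_retry_after_header=True,
    ),
)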