removed code pertaining to stats, now only relying on cache
Karma Riuk
2025-03-26 09:05:49 +01:00
parent 4c56a352e7
commit 77c5e2bec0
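The diff below drops the stats-file fast path: whether a PR can be skipped is now decided only by the cache, typed as dict[str, dict[int, DatasetEntry]] and keyed by repo full name, then PR number. A minimal sketch of that lookup (DatasetEntry is stood in by a placeholder and the repo/PR values are made up for illustration, so this is not the script itself):

from dataclasses import dataclass

@dataclass
class DatasetEntry:  # placeholder for the script's real DatasetEntry
    repo: str
    pr_number: int

# repo full name -> {PR number: entry}, matching the cache parameter's type hint
cache: dict[str, dict[int, DatasetEntry]] = {
    "octocat/hello-world": {42: DatasetEntry("octocat/hello-world", 42)},
}

def already_processed(repo_full_name: str, pr_number: int) -> bool:
    # same check as `pr.number in cache.get(repo.full_name, set())` in the new get_good_prs:
    # unknown repos fall back to an empty set, so nothing is skipped for them
    return pr_number in cache.get(repo_full_name, set())

print(already_processed("octocat/hello-world", 42))  # True  -> PR is skipped
print(already_processed("octocat/hello-world", 7))   # False -> PR is processed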


@@ -1,6 +1,6 @@
 from collections import defaultdict
 import argparse, os, subprocess, docker
-from typing import Any, Callable, Optional
+from typing import Any, Callable
 from github.PullRequest import PullRequest
 from github.Repository import Repository
 import pandas as pd
@@ -33,29 +33,19 @@ def is_pull_good(pull: PullRequest, verbose: bool = False):
         and pull.user.type != "Bot"
     )
-def get_good_prs(repo: Repository, stats_df: Optional[pd.DataFrame], cache: dict[str, dict[int, DatasetEntry]] = {}) -> list[PullRequest]:
+def get_good_prs(repo: Repository, cache: dict[str, dict[int, DatasetEntry]] = {}) -> list[PullRequest]:
     good_prs = []
-    prs = repo.get_pulls(state="closed")
-    if stats_df is None or repo.full_name not in stats_df["repo"].unique():
-        potenially_good_prs = prs
-        number_of_prs = prs.totalCount
-        from_stats = False
-    else:
-        potenially_good_prs_numbers = stats_df.loc[(stats_df["repo"] == repo.full_name) & (stats_df["has_only_1_comment"] == True)]["pr_number"]
-        if repo.full_name in cache:
-            potenially_good_prs_numbers = [n for n in potenially_good_prs_numbers if n not in cache[repo.full_name]]
-        potenially_good_prs = [repo.get_pull(n) for n in tqdm(potenially_good_prs_numbers, desc=f"Getting good PRs from stats", leave=False)]
-        number_of_prs = len(potenially_good_prs)
-        from_stats = True
+    potenially_good_prs = repo.get_pulls(state="closed")
+    number_of_prs = potenially_good_prs.totalCount
     if number_of_prs == 0:
         return []
     with tqdm(total=number_of_prs, desc=f"Extracting good PRs from {repo.full_name}", leave=False) as pbar:
         for pr in potenially_good_prs:
-            pbar.set_postfix({"found": len(good_prs), "pr_number": pr.number, "from_stats": from_stats})
-            if pr.merged_at is None:
+            pbar.set_postfix({"new good found": len(good_prs), "pr_number": pr.number})
+            if pr.merged_at is None or pr.number in cache.get(repo.full_name, set()):
                 pbar.update(1)
                 continue
            if is_pull_good(pr):
@@ -187,25 +177,21 @@ def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset, repos_dir:
     entry.metadata.reason_for_failure = "" # was set to 'still processing', since it's done being processed and was successful, there are no reasons for failure
     dataset.to_json(args.output)
-def process_repo(repo_name: str, stats_df: Optional[pd.DataFrame], dataset: Dataset, repos_dir: str, cache: dict[str, dict[int, DatasetEntry]] = {}):
+def process_repo(repo_name: str, dataset: Dataset, repos_dir: str, cache: dict[str, dict[int, DatasetEntry]] = {}):
     repo = g.get_repo(repo_name)
     if repo.full_name in cache:
         dataset.entries.extend(cache[repo.full_name].values())
         dataset.to_json(args.output)
     good_prs = []
-    good_prs = get_good_prs(repo, stats_df, cache)
-    if repo_name in cache:
-        good_prs = [pr for pr in good_prs if pr.number not in cache[repo_name]]
+    good_prs = get_good_prs(repo, cache)
     with tqdm(good_prs, desc="Processing good prs", leave=False) as pbar:
         for pr in pbar:
             pbar.set_postfix({"pr": pr.number})
             process_pull(repo, pr, dataset, repos_dir, cache)
-def process_repos(df: pd.DataFrame, stats_df: Optional[pd.DataFrame], dataset: Dataset, repos_dir: str, cache: dict[str, dict[int, DatasetEntry]] = {}):
+def process_repos(df: pd.DataFrame, dataset: Dataset, repos_dir: str, cache: dict[str, dict[int, DatasetEntry]] = {}):
     """
     Processes the repos in the given csv file, extracting the good ones and
     creating the "triplets" for the dataset.
@@ -216,12 +202,6 @@ def process_repos(df: pd.DataFrame, stats_df: Optional[pd.DataFrame], dataset: D
         Passing it by reference in order have the latest information, in case of an error
         verbose (bool): Whether to be verbose or not
     """
-    already_processed_repos = []
-    potentially_good_repos = []
-    if stats_df is not None:
-        already_processed_repos = stats_df["repo"].unique()
-        potentially_good_repos = stats_df.loc[stats_df["has_only_1_comment"]]["repo"].unique()
     with tqdm(total=len(df), desc="Processing repos") as pbar:
         for _, row in df.iterrows():
             repo_name = row["name"]
@@ -231,10 +211,7 @@ def process_repos(df: pd.DataFrame, stats_df: Optional[pd.DataFrame], dataset: D
                 "started at": datetime.now().strftime("%d/%m, %H:%M:%S"),
                 "# triplets": f"{len(dataset)}/{len(dataset.entries)} ({len(dataset)/len(dataset.entries) if len(dataset.entries) > 0 else 0:.2%})"
             })
-            if repo_name in already_processed_repos and repo_name not in potentially_good_repos:
-                pbar.update(1)
-                continue # skipping because we know there's nothing good already
-            process_repo(repo_name, stats_df, dataset, repos_dir, cache)
+            process_repo(repo_name, dataset, repos_dir, cache)
             pbar.update(1)
@@ -243,7 +220,6 @@ if __name__ == "__main__":
     parser.add_argument('csv_file', type=str, help='The csv file containing the projects (the results from clone_repos.py).')
     parser.add_argument('-o', '--output', type=str, default="./dataset.json", help='The file in which the dataset will be contained. Default is "./dataset.json"')
     parser.add_argument('-r', '--repos', type=str, default="./results/", help='The directory in which the repos were cloned (will be cloned if they aren\'t there already). Default: "./results/"')
-    parser.add_argument('-s', '--stats', type=str, help="The name of the output file from the stats_pull_requests.py. The stats file already knows which PRs are good (the ones with only 1 comment between two rounds of commits), so instead of going through all of PRs of a repo, we can fast-track using this. If the repo isn't in the stats file, we must go through each PR")
     parser.add_argument('-c', '--cache', type=str, help="The name of the output file from another run of this script. This is for when the script unexpectedly got interrupted and you want to resume from where you left off.")
     # parser.add_argument('-v', '--verbose', action='store_true', help='Prints the number of good projects.')
     parser.add_argument("--only-repo", type=str, help="If this argument is not provided, all the repos in the '--repos' csv will be processed. If instead you want to run the script on a single repo (for testing purposes mainly) provide a string of form 'XXX/YYY' to this argument, where XXX is the owner of the repo and YYY is the name of the repo")
@@ -264,11 +240,9 @@ if __name__ == "__main__":
        for cache_entry in cache_dataset.entries:
            cache[cache_entry.metadata.repo][cache_entry.metadata.pr_number] = cache_entry
-    stats_df = pd.read_csv(args.stats) if args.stats is not None else None
     dataset = Dataset()
     try:
         # try and finally to save, regardless of an error occuring or the program finished correctly
-        process_repos(df, stats_df, dataset, args.repos, cache)
+        process_repos(df, dataset, args.repos, cache)
     finally:
         dataset.to_json(args.output)
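For context, the cache consumed above is built from a previous run's dataset (the -c/--cache argument), indexed with the defaultdict pattern visible in the last hunk: cache[entry.metadata.repo][entry.metadata.pr_number] = entry. A rough, self-contained sketch of that indexing, with the Dataset/DatasetEntry classes mocked by plain dataclasses and made-up entries:

from collections import defaultdict
from dataclasses import dataclass

@dataclass
class Metadata:  # stand-in for the .metadata attribute of a DatasetEntry
    repo: str
    pr_number: int

@dataclass
class Entry:  # stand-in for DatasetEntry
    metadata: Metadata

# made-up entries; in the script they come from the dataset file written by a previous run
previous_entries = [
    Entry(Metadata("octocat/hello-world", 42)),
    Entry(Metadata("octocat/hello-world", 7)),
]

# same indexing pattern as the cache-building loop shown in the last hunk
cache: dict[str, dict[int, Entry]] = defaultdict(dict)
for cache_entry in previous_entries:
    cache[cache_entry.metadata.repo][cache_entry.metadata.pr_number] = cache_entry

print(sorted(cache["octocat/hello-world"]))  # [7, 42]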