first clean version of storing the entries of the

dataset, still need to check for tests
2025-10-13 19:58:02 +02:00 · 2025-03-14 12:05:19 +01:00
parent 468f793ded
commit 6437efaa54
1 changed files with 140 additions and 0 deletions
--- a/pull_requests.py
+++ b/pull_requests.py
@@ -0,0 +1,140 @@
+from typing import Optional
+from github.PullRequest import PullRequest
+from github.Repository import Repository
+import pandas as pd
+import argparse, os
+from github import Github
+from tqdm import tqdm
+from datetime import datetime
+
+from dataset import Dataset, DatasetEntry, FileData, Metadata, Diff
+from utils import has_only_1_comment
+
+
+def get_good_projects(csv_file: str, verbose: bool = False) -> pd.DataFrame:
+    """
+    Extracts the good projects (the ones that compile and test successfully,
+    and that have at least one test) from the given file.
+
+    Parameters:
+    csv_file (str): The csv file containing the projects. It must provide the
+        'good_repo_for_crab' and 'n_tests' columns.
+    verbose (bool): Whether to print which file is being read and the number
+        of good projects found in it.
+
+    Returns:
+    pd.DataFrame: The rows of the csv file describing the good projects.
+    """
+    if verbose:
+        print(f"Reading {csv_file}...")
+    df = pd.read_csv(csv_file)
+
+    # Comparing with `== True` (instead of using the column as the mask) makes
+    # rows with a missing flag count as "not good" rather than raising.
+    is_good = (df['good_repo_for_crab'] == True) & (df['n_tests'] > 0)
+    good_projects = df.loc[is_good]
+
+    if verbose:
+        print(f"Found {len(good_projects)} good projects out of {len(df)}")
+    return good_projects
+
+def is_pull_good(pull: PullRequest, verbose: bool = False):
+    """
+    Checks whether the given pull request qualifies for the dataset.
+
+    The decision is fully delegated to `has_only_1_comment`, which is given
+    the commits and the review comments of the pull request.
+
+    Parameters:
+    pull (PullRequest): The pull request to check.
+    verbose (bool): Forwarded as is to `has_only_1_comment`.
+
+    Returns:
+    Whatever `has_only_1_comment` returns for this pull request.
+    """
+    commits = pull.get_commits()
+    review_comments = pull.get_review_comments()
+    return has_only_1_comment(commits, review_comments, verbose=verbose)
+
+def get_good_prs(repo: Repository, stats_df: Optional[pd.DataFrame]) -> list[PullRequest]:
+    """
+    Collects the merged pull requests of the given repo that pass `is_pull_good`.
+
+    When the stats dataframe has rows for this repo, only the PRs it flags
+    with `has_only_1_comment` are fetched and checked (fast track). When no
+    stats are given, or when the repo does not appear in them, every closed
+    PR of the repo is checked.
+
+    Parameters:
+    repo (Repository): The repo from which the PRs are extracted.
+    stats_df (Optional[pd.DataFrame]): The stats with the columns 'repo',
+        'has_only_1_comment' and 'pr_number', or None if there aren't any.
+
+    Returns:
+    list[PullRequest]: The good PRs, in the order in which they were visited.
+    """
+    repo_stats = None
+    if stats_df is not None:
+        repo_stats = stats_df.loc[stats_df["repo"] == repo.full_name]
+
+    if repo_stats is None or repo_stats.empty:
+        # Nothing is known about this repo, so every closed PR must be checked
+        candidate_prs = repo.get_pulls(state="closed")
+        number_of_prs = candidate_prs.totalCount
+    else:
+        candidate_numbers = repo_stats.loc[repo_stats["has_only_1_comment"] == True, "pr_number"]
+        # int() so that the PR number is a plain integer whatever dtype the
+        # csv column ended up with
+        candidate_prs = [repo.get_pull(int(n)) for n in candidate_numbers]
+        number_of_prs = len(candidate_prs)
+
+    good_prs = []
+    with tqdm(total=number_of_prs, desc=f"Extracting good PRs from {repo.full_name}", leave=False) as pbar:
+        for pr in candidate_prs:
+            pbar.set_postfix({"found": len(good_prs), "pr_number": pr.number})
+            # PRs that were closed without being merged are never good
+            if pr.merged_at is not None and is_pull_good(pr):
+                good_prs.append(pr)
+            pbar.update(1)
+
+    return good_prs
+
+def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset):
+    """
+    Builds the dataset entry of the given pull request and appends it to
+    `dataset.entries`.
+
+    The entry holds the diffs between the base of the PR and its first
+    commit, the body of its review comment, and the diffs between its first
+    and its last commit. A PR without commits is skipped.
+
+    Parameters:
+    repo (Repository): The repo to which the pull request belongs.
+    pr (PullRequest): The pull request to process. It is expected to have
+        exactly one review comment (this is asserted).
+    dataset (Dataset): The dataset to which the new entry is appended.
+    """
+    def diffs_between(base_sha, head_sha):
+        # One Diff per file changed between the two commits
+        comparison = repo.compare(base_sha, head_sha)
+        return [Diff(changed.filename, changed.patch) for changed in comparison.files]
+
+    pr_commits = list(pr.get_commits())
+    if len(pr_commits) == 0:
+        return  # nothing to build an entry from
+
+    first, last = pr_commits[0], pr_commits[-1]
+
+    before = diffs_between(pr.base.sha, first.sha)
+
+    review_comments = list(pr.get_review_comments())
+    assert len(review_comments) == 1
+    if review_comments:
+        comment_body = review_comments[0].body
+    else:
+        # only reachable when assertions are disabled
+        comment_body = ""
+
+    after = diffs_between(first.sha, last.sha)
+
+    metadata = Metadata(repo.full_name, pr.number, pr.merge_commit_sha, True)
+    touched_files = [FileData(changed.filename) for changed in pr.get_files()]
+    entry = DatasetEntry(
+        metadata=metadata,
+        files=touched_files,
+        diffs_before=before,
+        comment=comment_body,
+        diffs_after=after,
+    )
+    dataset.entries.append(entry)
+
+def process_repo(repo_name: str, stats_df: Optional[pd.DataFrame], dataset: Dataset, verbose: bool = False):
+    """
+    Extracts the good PRs of the given repo and stores an entry for each of
+    them in the dataset.
+
+    The repo is fetched through the module-level `g` GitHub client, which is
+    only created when this file is run as a script.
+
+    Parameters:
+    repo_name (str): The full name of the repo, as understood by `g.get_repo`.
+    stats_df (Optional[pd.DataFrame]): The stats handed over to `get_good_prs`,
+        or None if there aren't any.
+    dataset (Dataset): The dataset in which the entries will be stored.
+    verbose (bool): Currently unused.
+    """
+    repo = g.get_repo(repo_name)
+    good_prs = get_good_prs(repo, stats_df)
+
+    for pr in tqdm(good_prs, desc="Processing good prs", leave=False):
+        process_pull(repo, pr, dataset)
+
+def process_repos(csv_file: str, stats_csv: Optional[str], dataset: Dataset, verbose: bool = False):
+    """
+    Processes the repos in the given csv file, extracting the good ones and
+    creating the "triplets" for the dataset.
+
+    Parameters:
+    csv_file (str): The csv file containing the projects.
+    stats_csv (Optional[str]): The csv file containing the stats of the pull
+        requests (columns 'repo' and 'has_only_1_comment' are read here), or
+        None if there isn't one. Repos that appear in it without any PR
+        flagged as `has_only_1_comment` are skipped.
+    dataset (Dataset): The dataset in which the triplets will be stored.
+        Passing it by reference in order to have the latest information, in
+        case of an error
+    verbose (bool): Whether to be verbose or not
+    """
+    df = get_good_projects(csv_file, verbose=verbose)
+    stats_df = pd.read_csv(stats_csv) if stats_csv is not None else None
+
+    # Sets, so that the membership tests in the loop below are constant time
+    already_processed_repos = set()
+    potentially_good_repos = set()
+    if stats_df is not None:
+        already_processed_repos = set(stats_df["repo"].unique())
+        has_good_pr = stats_df["has_only_1_comment"] == True
+        potentially_good_repos = set(stats_df.loc[has_good_pr, "repo"].unique())
+
+    with tqdm(total=len(df), desc="Processing repos") as pbar:
+        for repo_name in df["name"]:
+            assert isinstance(repo_name, str)
+            pbar.set_postfix({
+                "repo": repo_name, 
+                "started at": datetime.now().strftime("%d/%m, %H:%M:%S"),
+                "# triplets": len(dataset)
+            })
+            # A repo that is in the stats without a single good PR is known
+            # to have nothing to offer, so it isn't worth processing
+            nothing_good = repo_name in already_processed_repos and repo_name not in potentially_good_repos
+            if not nothing_good:
+                process_repo(repo_name, stats_df, dataset, verbose=verbose)
+            pbar.update(1)
+
+
+if __name__ == "__main__":
+    # Command line interface
+    parser = argparse.ArgumentParser(description='Creates the triplets for the CRAB dataset.')
+    parser.add_argument(
+        'csv_file',
+        type=str,
+        help='The csv file containing the projects (the results from clone_repos.py).',
+    )
+    parser.add_argument(
+        '-o', '--output',
+        type=str,
+        default="./dataset.json",
+        help='The file in which the dataset will be contained. Default is "./dataset.json"',
+    )
+    parser.add_argument(
+        '-s', '--stats',
+        type=str,
+        help="The name of the output file from the stats_pull_requests.py. The stats file already knows which PRs are good (the ones with only 1 comment between two rounds of commits), so instead of going through all of PRs of a repo, we can fast-track using this. If the repo isn't in the stats file, we must go through each PR",
+    )
+    parser.add_argument(
+        '-v', '--verbose',
+        action='store_true',
+        help='Prints the number of good projects.',
+    )
+
+    args = parser.parse_args()
+
+    # GitHub client used by process_repo; the token is read from the environment
+    g = Github(os.environ["GITHUB_AUTH_TOKEN_CRAB"])
+
+    dataset = Dataset()
+    try:
+        # try/finally so that whatever was collected gets saved, whether an
+        # error occurred or the program finished correctly
+        process_repos(args.csv_file, args.stats, dataset, verbose=args.verbose)
+    finally:
+        dataset.to_json(args.output)