extracted a function, updated it, and used it in pull_requests
This commit is contained in:
Karma Riuk
2025-03-14 14:38:24 +01:00
parent fd82ff5128
commit 4544922165
3 changed files with 77 additions and 39 deletions

View File

@@ -1,11 +1,12 @@
 import pandas as pd
-import argparse, os, sys, subprocess, docker
+import argparse, os, sys, docker
 from tqdm import tqdm
 import shutil
 from typing import Optional
 from datetime import datetime
 from handlers import GradleHandler, MavenHandler, BuildHandler
+from utils import clone

 tqdm.pandas()

@@ -17,36 +18,6 @@ EXCLUSION_LIST = [
     "Starcloud-Cloud/starcloud-llmops", # requires authentication
 ]

-def clone(repo: str, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None:
-    """
-    Clones a GitHub repository into a local directory.
-
-    Args:
-        repo (str): The repository to clone, in the format "owner/repo_name".
-        force (bool, optional): If `True`, re-clones the repository even if it already exists. Defaults to `False`.
-    """
-    local_repo_path = os.path.join(dest, repo)
-    if not force and os.path.exists(local_repo_path):
-        # if verbose: print(f"Skipping {repo}, already exists")
-        updates["cloned_successfully"] = "Already exists"
-        return
-
-    if verbose: print(f"Cloning {repo}")
-    proc = subprocess.run(
-        ["git", "clone", "--depth", "1", f"https://github.com/{repo}", local_repo_path],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE
-    )
-    if proc.returncode != 0:
-        updates["cloned_successfully"] = False
-        print(f"Failed to clone {repo}", file=sys.stderr)
-        print(f"Error message was:", file=sys.stderr)
-        error_msg = proc.stderr.decode()
-        print(error_msg, file=sys.stderr)
-        updates["error_msg"] = error_msg
-    else:
-        updates["cloned_successfully"] = True
-
 def get_build_handler(root: str, repo: str, updates: dict, verbose: bool = False) -> Optional[BuildHandler]:
     """
     Get the path to the build file of a repository. The build file is either a
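
With the helper extracted, this script (presumably clone_repos.py) now gets clone from utils instead of defining it inline. A minimal sketch of the shared helper in use, assuming utils.py is on the import path; the repo name, destination directory, and updates dict are illustrative:

from utils import clone

updates: dict = {}
# clone records its outcome in updates instead of returning a status
clone("octocat/Hello-World", "./results/", updates, verbose=True)
print(updates.get("cloned_successfully"))  # True, False, or "Already exists"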

View File

@@ -1,8 +1,8 @@
-import argparse, os, subprocess
 from typing import Optional
 from github.PullRequest import PullRequest
 from github.Repository import Repository
 import pandas as pd
+import argparse, os
 from github import Github
 from tqdm import tqdm
 from datetime import datetime

@@ -57,7 +57,7 @@ def get_good_prs(repo: Repository, stats_df: Optional[pd.DataFrame]) -> list[Pul
     return good_prs

-def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset):
+def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset, repos_dir: str):
     commits = list(pr.get_commits())
     if not commits:
         return # No commits, skip processing

@@ -73,6 +73,7 @@ def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset):
     diffs_after = [Diff(file.filename, file.patch) for file in repo.compare(first_commit.sha, last_commit.sha).files]

     dataset.entries.append(DatasetEntry(
         metadata=Metadata(repo.full_name, pr.number, pr.merge_commit_sha, True),
         files=[FileData(file.filename) for file in pr.get_files()],

@@ -81,15 +82,15 @@ def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset):
         diffs_after=diffs_after,
     ))

-def process_repo(repo_name: str, stats_df: Optional[pd.DataFrame], dataset: Dataset):
+def process_repo(repo_name: str, stats_df: Optional[pd.DataFrame], dataset: Dataset, repos_dir: str):
     good_prs = []
     repo = g.get_repo(repo_name)

     good_prs = get_good_prs(repo, stats_df)
     for pr in tqdm(good_prs, desc="Processing good prs", leave=False):
-        process_pull(repo, pr, dataset)
+        process_pull(repo, pr, dataset, repos_dir)

-def process_repos(csv_file: str, stats_csv: Optional[str], dataset: Dataset):
+def process_repos(csv_file: str, stats_csv: Optional[str], dataset: Dataset, repos_dir: str):
     """
     Processes the repos in the given csv file, extracting the good ones and
     creating the "triplets" for the dataset.

@@ -120,7 +121,7 @@ def process_repos(csv_file: str, stats_csv: Optional[str], dataset: Dataset):
         if repo_name in already_processed_repos and repo_name not in potentially_good_repos:
             pbar.update(1)
             continue # skipping because we know there's nothing good already
-        process_repo(repo_name, stats_df, dataset)
+        process_repo(repo_name, stats_df, dataset, repos_dir)
         pbar.update(1)

@@ -128,6 +129,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Creates the triplets for the CRAB dataset.')
     parser.add_argument('csv_file', type=str, help='The csv file containing the projects (the results from clone_repos.py).')
     parser.add_argument('-o', '--output', type=str, default="./dataset.json", help='The file in which the dataset will be contained. Default is "./dataset.json"')
+    parser.add_argument('-r', '--repos', type=str, default="./results/", help='The directory in which the repos were cloned (they will be cloned there if they aren\'t already). Default: "./results/"')
     parser.add_argument('-s', '--stats', type=str, help="The name of the output file from stats_pull_requests.py. The stats file already knows which PRs are good (the ones with only 1 comment between two rounds of commits), so instead of going through all the PRs of a repo, we can fast-track using this. If the repo isn't in the stats file, we must go through each PR.")
     # parser.add_argument('-v', '--verbose', action='store_true', help='Prints the number of good projects.')

@@ -138,6 +140,6 @@ if __name__ == "__main__":
     dataset = Dataset()
     try:
         # try/finally so the dataset is saved regardless of whether an error occurs or the program finishes correctly
-        process_repos(args.csv_file, args.stats, dataset)
+        process_repos(args.csv_file, args.stats, dataset, args.repos)
     finally:
         dataset.to_json(args.output)
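
The new repos_dir argument is threaded from the -r/--repos CLI flag through process_repos and process_repo down to process_pull. How process_pull consumes it is not fully visible in this diff (the hunk at line 73 adds one line that is not shown), but a plausible, purely hypothetical use is ensuring a local checkout before further processing; ensure_local_checkout below is not part of the commit:

import os
from utils import clone

# Hypothetical sketch: guarantee that "owner/repo" exists under repos_dir
# (clone skips the work when the checkout is already present) and return its path.
def ensure_local_checkout(repo_full_name: str, repos_dir: str) -> str:
    updates: dict = {}
    clone(repo_full_name, repos_dir, updates)
    return os.path.join(repos_dir, repo_full_name)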

View File

@@ -1,9 +1,9 @@
+import os, sys, logging, subprocess
 from datetime import datetime
 from github.Commit import Commit
 from github.PaginatedList import PaginatedList
 from github.PullRequestComment import PullRequestComment
 from tqdm import tqdm
-import logging

 def move_github_logging_to_file():
     github_logger = logging.getLogger("github")

@@ -77,3 +77,68 @@ def has_only_1_comment(commits: PaginatedList[Commit], comments: PaginatedList[P
             continue
     if verbose: print(f"n_before: {n_before}, n_after: {n_after}")
     return n_before >= 1 and n_after >= 1
+
+def is_already_repo_cloned(repos_dir: str, repo_name: str) -> bool:
+    """
+    Checks if the repository is cloned locally and if its remote URL matches the expected GitHub repository URL.
+
+    Parameters:
+        repos_dir (str): The directory where repositories are stored.
+        repo_name (str): The name of the repository.
+
+    Returns:
+        bool: True if the repository is correctly cloned, False otherwise.
+    """
+    path = os.path.join(repos_dir, repo_name)
+    if not os.path.exists(path) or not os.path.isdir(path):
+        return False
+
+    try:
+        result = subprocess.run(
+            ["git", "-C", path, "remote", "-v"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        remote_urls = result.stdout.splitlines()
+        expected_url = f"https://github.com/{repo_name}"
+        return any(expected_url in url for url in remote_urls)
+    except subprocess.CalledProcessError:
+        return False
+
+def clone(repo: str, dest: str, updates: dict = {}, force: bool = False, verbose: bool = False) -> None:
+    """
+    Clones a GitHub repository into a local directory.
+
+    Args:
+        repo (str): The GitHub repository to clone, in the format "owner/repo".
+        dest (str): The directory to clone the repository into.
+        updates (dict, optional): A dictionary to store updates about the cloning process.
+        force (bool): Whether to force the cloning process, even if the repository already exists.
+        verbose (bool): Whether to print verbose output.
+    """
+    local_repo_path = os.path.join(dest, repo)
+    if not force and is_already_repo_cloned(dest, repo):
+        # if verbose: print(f"Skipping {repo}, already exists")
+        updates["cloned_successfully"] = "Already exists"
+        return
+
+    if verbose: print(f"Cloning {repo}")
+    proc = subprocess.run(
+        ["git", "clone", "--depth", "1", f"https://github.com/{repo}", local_repo_path],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE
+    )
+    if proc.returncode != 0:
+        updates["cloned_successfully"] = False
+        print(f"Failed to clone {repo}", file=sys.stderr)
+        print(f"Error message was:", file=sys.stderr)
+        error_msg = proc.stderr.decode()
+        print(error_msg, file=sys.stderr)
+        updates["error_msg"] = error_msg
+    else:
+        updates["cloned_successfully"] = True
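
A minimal usage sketch of the two new helpers, assuming utils.py is importable; the repository name and directory are placeholders:

from utils import clone, is_already_repo_cloned

repos_dir = "./results/"
repo = "octocat/Hello-World"  # placeholder

updates: dict = {}
clone(repo, repos_dir, updates, verbose=True)

# is_already_repo_cloned also rejects a directory whose git remote
# does not point at the expected GitHub URL
if is_already_repo_cloned(repos_dir, repo):
    print(f"{repo} available locally, status: {updates['cloned_successfully']}")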