From 45449221657c45e2143df68386a5fb5e9a469e67 Mon Sep 17 00:00:00 2001 From: Karma Riuk Date: Fri, 14 Mar 2025 14:38:24 +0100 Subject: [PATCH] extracted a function, updated it, and using it in pull_requests --- clone_repos.py | 33 ++---------------------- pull_requests.py | 16 +++++++----- utils.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 77 insertions(+), 39 deletions(-) diff --git a/clone_repos.py b/clone_repos.py index b0ec1d5..ffffdef 100644 --- a/clone_repos.py +++ b/clone_repos.py @@ -1,11 +1,12 @@ import pandas as pd -import argparse, os, sys, subprocess, docker +import argparse, os, sys, docker from tqdm import tqdm import shutil from typing import Optional from datetime import datetime from handlers import GradleHandler, MavenHandler, BuildHandler +from utils import clone tqdm.pandas() @@ -17,36 +18,6 @@ EXCLUSION_LIST = [ "Starcloud-Cloud/starcloud-llmops", # requires authentication ] -def clone(repo: str, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None: - """ - Clones a GitHub repository into a local directory. - - Args: - repo (str): The repository to clone, in the format "owner/repo_name". - force (bool, optional): If `True`, re-clones the repository even if it already exists. Defaults to `False`. 
- """ - local_repo_path = os.path.join(dest, repo) - if not force and os.path.exists(local_repo_path): - # if verbose: print(f"Skipping {repo}, already exists") - updates["cloned_successfully"] = "Already exists" - return - - if verbose: print(f"Cloning {repo}") - proc = subprocess.run( - ["git", "clone", "--depth", "1", f"https://github.com/{repo}", local_repo_path], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE - ) - if proc.returncode != 0: - updates["cloned_successfully"] = False - print(f"Failed to clone {repo}", file=sys.stderr) - print(f"Error message was:", file=sys.stderr) - error_msg = proc.stderr.decode() - print(error_msg, file=sys.stderr) - updates["error_msg"] = error_msg - else: - updates["cloned_successfully"] = True - def get_build_handler(root: str, repo: str, updates: dict, verbose: bool = False) -> Optional[BuildHandler]: """ Get the path to the build file of a repository. The build file is either a diff --git a/pull_requests.py b/pull_requests.py index 0c3717d..83177bd 100644 --- a/pull_requests.py +++ b/pull_requests.py @@ -1,8 +1,8 @@ +import argparse, os, subprocess from typing import Optional from github.PullRequest import PullRequest from github.Repository import Repository import pandas as pd -import argparse, os from github import Github from tqdm import tqdm from datetime import datetime @@ -57,7 +57,7 @@ def get_good_prs(repo: Repository, stats_df: Optional[pd.DataFrame]) -> list[Pul return good_prs -def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset): +def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset, repos_dir: str): commits = list(pr.get_commits()) if not commits: return # No commits, skip processing @@ -73,6 +73,7 @@ def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset): diffs_after = [Diff(file.filename, file.patch) for file in repo.compare(first_commit.sha, last_commit.sha).files] + dataset.entries.append(DatasetEntry( metadata=Metadata(repo.full_name, pr.number, 
pr.merge_commit_sha, True), files=[FileData(file.filename) for file in pr.get_files()], @@ -81,15 +82,15 @@ def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset): diffs_after=diffs_after, )) -def process_repo(repo_name: str, stats_df: Optional[pd.DataFrame], dataset: Dataset): +def process_repo(repo_name: str, stats_df: Optional[pd.DataFrame], dataset: Dataset, repos_dir: str): good_prs = [] repo = g.get_repo(repo_name) good_prs = get_good_prs(repo, stats_df) for pr in tqdm(good_prs, desc="Processing good prs", leave=False): - process_pull(repo, pr, dataset) + process_pull(repo, pr, dataset, repos_dir) -def process_repos(csv_file: str, stats_csv: Optional[str], dataset: Dataset): +def process_repos(csv_file: str, stats_csv: Optional[str], dataset: Dataset, repos_dir: str): """ Processes the repos in the given csv file, extracting the good ones and creating the "triplets" for the dataset. @@ -120,7 +121,7 @@ def process_repos(csv_file: str, stats_csv: Optional[str], dataset: Dataset): if repo_name in already_processed_repos and repo_name not in potentially_good_repos: pbar.update(1) continue # skipping because we know there's nothing good already - process_repo(repo_name, stats_df, dataset) + process_repo(repo_name, stats_df, dataset, repos_dir) pbar.update(1) @@ -128,6 +129,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description='Creates the triplets for the CRAB dataset.') parser.add_argument('csv_file', type=str, help='The csv file containing the projects (the results from clone_repos.py).') parser.add_argument('-o', '--output', type=str, default="./dataset.json", help='The file in which the dataset will be contained. Default is "./dataset.json"') + parser.add_argument('-r', '--repos', type=str, default="./results/", help='The directory in which the repos were cloned (will be cloned if they aren\'t there already). 
Default: "./results/"') parser.add_argument('-s', '--stats', type=str, help="The name of the output file from the stats_pull_requests.py. The stats file already knows which PRs are good (the ones with only 1 comment between two rounds of commits), so instead of going through all of PRs of a repo, we can fast-track using this. If the repo isn't in the stats file, we must go through each PR") # parser.add_argument('-v', '--verbose', action='store_true', help='Prints the number of good projects.') @@ -138,6 +140,6 @@ if __name__ == "__main__": dataset = Dataset() try: # try and finally to save, regardless of an error occuring or the program finished correctly - process_repos(args.csv_file, args.stats, dataset) + process_repos(args.csv_file, args.stats, dataset, args.repos) finally: dataset.to_json(args.output) diff --git a/utils.py b/utils.py index 1ef5890..321090e 100644 --- a/utils.py +++ b/utils.py @@ -1,9 +1,9 @@ +import os, sys, logging, subprocess from datetime import datetime from github.Commit import Commit from github.PaginatedList import PaginatedList from github.PullRequestComment import PullRequestComment from tqdm import tqdm -import logging def move_github_logging_to_file(): github_logger = logging.getLogger("github") @@ -77,3 +77,68 @@ def has_only_1_comment(commits: PaginatedList[Commit], comments: PaginatedList[P continue if verbose: print(f"n_before: {n_before}, n_after: {n_after}") return n_before >= 1 and n_after >= 1 + +def is_already_repo_cloned(repos_dir: str, repo_name: str) -> bool: + """ + Checks if the repository is cloned locally and if its remote URL matches the expected GitHub repository URL. + + Parameters: + repos_dir (str): The directory where repositories are stored. + repo_name (str): The name of the repository. + + Returns: + bool: True if the repository is correctly cloned, False otherwise. 
+ """ + path = os.path.join(repos_dir, repo_name) + + if not os.path.exists(path) or not os.path.isdir(path): + return False + + try: + result = subprocess.run( + ["git", "-C", path, "remote", "-v"], + capture_output=True, + text=True, + check=True + ) + + remote_urls = result.stdout.splitlines() + expected_url = f"https://github.com/{repo_name}" + + return any(f"{expected_url} " in url or f"{expected_url}.git " in url for url in remote_urls) + + except subprocess.CalledProcessError: + return False + +def clone(repo: str, dest: str, updates: dict = {}, force: bool = False, verbose: bool = False) -> None: + """ + Clones a GitHub repository into a local directory. + + Args: + repo (str): The GitHub repository to clone, in the format "owner/repo". + dest (str): The directory to clone the repository into. + updates (dict, optional): A dictionary to store updates about the cloning process. + force (bool): Whether to force the cloning process, even if the repository already exists. + verbose (bool): Whether to print verbose output. + """ + local_repo_path = os.path.join(dest, repo) + if not force and is_already_repo_cloned(dest, repo): + # if verbose: print(f"Skipping {repo}, already exists") + updates["cloned_successfully"] = "Already exists" + return + + if verbose: print(f"Cloning {repo}") + proc = subprocess.run( + ["git", "clone", "--depth", "1", f"https://github.com/{repo}", local_repo_path], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + if proc.returncode != 0: + updates["cloned_successfully"] = False + print(f"Failed to clone {repo}", file=sys.stderr) + print(f"Error message was:", file=sys.stderr) + error_msg = proc.stderr.decode() + print(error_msg, file=sys.stderr) + updates["error_msg"] = error_msg + else: + updates["cloned_successfully"] = True