extracted a function, updated it, and used it in pull_requests
2025-08-21 02:27:53 +02:00 · 2025-03-14 14:38:24 +01:00
parent fd82ff5128
commit 4544922165
3 changed files with 77 additions and 39 deletions
--- a/clone_repos.py
+++ b/clone_repos.py
@@ -1,11 +1,12 @@
 import pandas as pd
-import argparse, os, sys, subprocess, docker
+import argparse, os, sys, docker
 from tqdm import tqdm
 import shutil
 from typing import Optional
 from datetime import datetime

 from handlers import GradleHandler, MavenHandler, BuildHandler
+from utils import clone

 tqdm.pandas()

@@ -17,36 +18,6 @@ EXCLUSION_LIST = [
    "Starcloud-Cloud/starcloud-llmops", # requires authentication
 ]

-def clone(repo: str, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None:
-    """
-    Clones a GitHub repository into a local directory.
-
-    Args:
-        repo (str): The repository to clone, in the format "owner/repo_name".
-        force (bool, optional): If `True`, re-clones the repository even if it already exists. Defaults to `False`.
-    """
-    local_repo_path = os.path.join(dest, repo)
-    if not force and os.path.exists(local_repo_path):
-        # if verbose: print(f"Skipping {repo}, already exists")
-        updates["cloned_successfully"] = "Already exists"
-        return 
-
-    if verbose: print(f"Cloning {repo}")
-    proc = subprocess.run(
-        ["git", "clone", "--depth", "1", f"https://github.com/{repo}", local_repo_path],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE
-    )
-    if proc.returncode != 0:
-        updates["cloned_successfully"] = False
-        print(f"Failed to clone {repo}", file=sys.stderr)
-        print(f"Error message was:", file=sys.stderr)
-        error_msg = proc.stderr.decode()
-        print(error_msg, file=sys.stderr)
-        updates["error_msg"] = error_msg
-    else:
-        updates["cloned_successfully"] = True
-
 def get_build_handler(root: str, repo: str, updates: dict, verbose: bool = False) -> Optional[BuildHandler]:
    """
    Get the path to the build file of a repository. The build file is either a
--- a/pull_requests.py
+++ b/pull_requests.py
@@ -1,8 +1,8 @@
+import argparse, os, subprocess
 from typing import Optional
 from github.PullRequest import PullRequest
 from github.Repository import Repository
 import pandas as pd
-import argparse, os
 from github import Github
 from tqdm import tqdm
 from datetime import datetime
@@ -57,7 +57,7 @@ def get_good_prs(repo: Repository, stats_df: Optional[pd.DataFrame]) -> list[Pul

    return good_prs

-def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset):
+def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset, repos_dir: str):
    commits = list(pr.get_commits())
    if not commits:
        return  # No commits, skip processing
@@ -73,6 +73,7 @@ def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset):

    diffs_after = [Diff(file.filename, file.patch) for file in repo.compare(first_commit.sha, last_commit.sha).files]

+
    dataset.entries.append(DatasetEntry(
        metadata=Metadata(repo.full_name, pr.number, pr.merge_commit_sha, True),
        files=[FileData(file.filename) for file in pr.get_files()],
@@ -81,15 +82,15 @@ def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset):
        diffs_after=diffs_after,
    ))

-def process_repo(repo_name: str, stats_df: Optional[pd.DataFrame], dataset: Dataset):
+def process_repo(repo_name: str, stats_df: Optional[pd.DataFrame], dataset: Dataset, repos_dir: str):
    good_prs = []
    repo = g.get_repo(repo_name)
    good_prs = get_good_prs(repo, stats_df)

    for pr in tqdm(good_prs, desc="Processing good prs", leave=False):
-        process_pull(repo, pr, dataset)
+        process_pull(repo, pr, dataset, repos_dir)

-def process_repos(csv_file: str, stats_csv: Optional[str], dataset: Dataset):
+def process_repos(csv_file: str, stats_csv: Optional[str], dataset: Dataset, repos_dir: str):
    """
    Processes the repos in the given csv file, extracting the good ones and
    creating the "triplets" for the dataset.
@@ -120,7 +121,7 @@ def process_repos(csv_file: str, stats_csv: Optional[str], dataset: Dataset):
            if repo_name in already_processed_repos and repo_name not in potentially_good_repos:
                pbar.update(1)
                continue # skipping because we know there's nothing good already
-            process_repo(repo_name, stats_df, dataset)
+            process_repo(repo_name, stats_df, dataset, repos_dir)
            pbar.update(1)


@@ -128,6 +129,7 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Creates the triplets for the CRAB dataset.')
    parser.add_argument('csv_file', type=str, help='The csv file containing the projects (the results from clone_repos.py).')
    parser.add_argument('-o', '--output', type=str, default="./dataset.json", help='The file in which the dataset will be contained. Default is "./dataset.json"')
+    parser.add_argument('-r', '--repos', type=str, default="./results/", help='The directory in which the repos were cloned (will be cloned if they aren\'t there already). Default: "./results/"')
    parser.add_argument('-s', '--stats', type=str, help="The name of the output file from the stats_pull_requests.py. The stats file already knows which PRs are good (the ones with only 1 comment between two rounds of commits), so instead of going through all of PRs of a repo, we can fast-track using this. If the repo isn't in the stats file, we must go through each PR")
    # parser.add_argument('-v', '--verbose', action='store_true', help='Prints the number of good projects.')

@@ -138,6 +140,6 @@ if __name__ == "__main__":
    dataset = Dataset()
    try:
        # try and finally to save, regardless of an error occuring or the program finished correctly
-        process_repos(args.csv_file, args.stats, dataset)
+        process_repos(args.csv_file, args.stats, dataset, repos_dir=args.repos)
    finally:
        dataset.to_json(args.output)
--- a/utils.py
+++ b/utils.py
@@ -1,9 +1,9 @@
+import os, sys, logging, subprocess
 from datetime import datetime
 from github.Commit import Commit
 from github.PaginatedList import PaginatedList
 from github.PullRequestComment import PullRequestComment
 from tqdm import tqdm
-import logging

 def move_github_logging_to_file():
    github_logger = logging.getLogger("github")
@@ -77,3 +77,68 @@ def has_only_1_comment(commits: PaginatedList[Commit], comments: PaginatedList[P
            continue
    if verbose: print(f"n_before: {n_before}, n_after: {n_after}")
    return n_before >= 1 and n_after >= 1
+
+def is_already_repo_cloned(repos_dir: str, repo_name: str) -> bool:
+    """
+    Checks if the repository is cloned locally and if one of its remotes is the expected GitHub repository URL.
+
+    The URL column of `git remote -v` is compared for equality (ignoring a trailing "/" or ".git"), not by substring.
+
+    Parameters:
+    repos_dir (str): The directory where repositories are stored.
+    repo_name (str): The name of the repository, in the format "owner/repo".
+
+    Returns:
+    bool: True if the repository is correctly cloned, False otherwise (also when the git command fails).
+    """
+    path = os.path.join(repos_dir, repo_name)
+    if not os.path.isdir(path):
+        return False
+    try:
+        result = subprocess.run(
+            ["git", "-C", path, "remote", "-v"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+    except subprocess.CalledProcessError:
+        return False
+
+    expected_url = f"https://github.com/{repo_name}".rstrip("/")
+    # each output line is "<remote> <url> (fetch|push)"; a substring test would accept "owner/repo-fork" for "owner/repo"
+    fields = (line.split() for line in result.stdout.splitlines())
+    urls = (parts[1].rstrip("/") for parts in fields if len(parts) >= 2)
+    return any((url[:-4] if url.endswith(".git") else url) == expected_url for url in urls)
+
+def clone(repo: str, dest: str, updates: "dict | None" = None, force: bool = False, verbose: bool = False) -> None:
+    """
+    Clones a GitHub repository into a local directory.
+
+    Args:
+        repo (str): The GitHub repository to clone, in the format "owner/repo".
+        dest (str): The directory to clone the repository into.
+        updates (dict, optional): Receives "cloned_successfully" and, on failure, "error_msg". Defaults to a new dict.
+        force (bool): Whether to force the cloning process, even if the repository already exists.
+        verbose (bool): Whether to print verbose output.
+    """
+    if updates is None: updates = {}  # a `{}` default would be one dict shared (with stale keys) by every call
+    local_repo_path = os.path.join(dest, repo)
+    if not force and is_already_repo_cloned(dest, repo):
+        updates["cloned_successfully"] = "Already exists"
+        return
+
+    if verbose: print(f"Cloning {repo}")
+    proc = subprocess.run(
+        ["git", "clone", "--depth", "1", f"https://github.com/{repo}", local_repo_path],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE
+    )
+    if proc.returncode != 0:
+        error_msg = proc.stderr.decode(errors="replace")  # git output is not guaranteed to be valid UTF-8
+        updates["cloned_successfully"] = False
+        updates["error_msg"] = error_msg
+        print(f"Failed to clone {repo}", file=sys.stderr)
+        print("Error message was:", file=sys.stderr)
+        print(error_msg, file=sys.stderr)
+    else:
+        updates["cloned_successfully"] = True