Mirror of https://github.com/karma-riuk/crab.git (synced 2025-07-05 05:28:13 +02:00)

Commit: extracted a function, updated it, and now using it in pull_requests
@@ -1,11 +1,12 @@
 import pandas as pd
-import argparse, os, sys, subprocess, docker
+import argparse, os, sys, docker
 from tqdm import tqdm
 import shutil
 from typing import Optional
 from datetime import datetime

 from handlers import GradleHandler, MavenHandler, BuildHandler
+from utils import clone

 tqdm.pandas()

@@ -17,36 +18,6 @@ EXCLUSION_LIST = [
     "Starcloud-Cloud/starcloud-llmops", # requires authentication
 ]

-def clone(repo: str, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None:
-    """
-    Clones a GitHub repository into a local directory.
-
-    Args:
-        repo (str): The repository to clone, in the format "owner/repo_name".
-        force (bool, optional): If `True`, re-clones the repository even if it already exists. Defaults to `False`.
-    """
-    local_repo_path = os.path.join(dest, repo)
-    if not force and os.path.exists(local_repo_path):
-        # if verbose: print(f"Skipping {repo}, already exists")
-        updates["cloned_successfully"] = "Already exists"
-        return
-
-    if verbose: print(f"Cloning {repo}")
-    proc = subprocess.run(
-        ["git", "clone", "--depth", "1", f"https://github.com/{repo}", local_repo_path],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE
-    )
-    if proc.returncode != 0:
-        updates["cloned_successfully"] = False
-        print(f"Failed to clone {repo}", file=sys.stderr)
-        print(f"Error message was:", file=sys.stderr)
-        error_msg = proc.stderr.decode()
-        print(error_msg, file=sys.stderr)
-        updates["error_msg"] = error_msg
-    else:
-        updates["cloned_successfully"] = True
-
 def get_build_handler(root: str, repo: str, updates: dict, verbose: bool = False) -> Optional[BuildHandler]:
     """
     Get the path to the build file of a repository. The build file is either a
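Note: the extracted clone reports its outcome through the caller-supplied updates dict rather than through a return value. A minimal sketch of that calling convention, based only on the code shown above (the repository name and destination directory are illustrative):

    from utils import clone

    updates: dict = {}
    clone("karma-riuk/crab", "./results/", updates, verbose=True)

    if updates["cloned_successfully"] is True:
        print("freshly cloned")
    elif updates["cloned_successfully"] == "Already exists":
        print("reusing the existing checkout")
    else:
        print("clone failed:", updates.get("error_msg"))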
@@ -1,8 +1,8 @@
+import argparse, os, subprocess
 from typing import Optional
 from github.PullRequest import PullRequest
 from github.Repository import Repository
 import pandas as pd
-import argparse, os
 from github import Github
 from tqdm import tqdm
 from datetime import datetime
@@ -57,7 +57,7 @@ def get_good_prs(repo: Repository, stats_df: Optional[pd.DataFrame]) -> list[PullRequest]:

     return good_prs

-def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset):
+def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset, repos_dir: str):
     commits = list(pr.get_commits())
     if not commits:
         return  # No commits, skip processing
@@ -73,6 +73,7 @@ def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset):

     diffs_after = [Diff(file.filename, file.patch) for file in repo.compare(first_commit.sha, last_commit.sha).files]

+
     dataset.entries.append(DatasetEntry(
         metadata=Metadata(repo.full_name, pr.number, pr.merge_commit_sha, True),
         files=[FileData(file.filename) for file in pr.get_files()],
@@ -81,15 +82,15 @@ def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset):
         diffs_after=diffs_after,
     ))

-def process_repo(repo_name: str, stats_df: Optional[pd.DataFrame], dataset: Dataset):
+def process_repo(repo_name: str, stats_df: Optional[pd.DataFrame], dataset: Dataset, repos_dir: str):
     good_prs = []
     repo = g.get_repo(repo_name)
     good_prs = get_good_prs(repo, stats_df)

     for pr in tqdm(good_prs, desc="Processing good prs", leave=False):
-        process_pull(repo, pr, dataset)
+        process_pull(repo, pr, dataset, repos_dir)

-def process_repos(csv_file: str, stats_csv: Optional[str], dataset: Dataset):
+def process_repos(csv_file: str, stats_csv: Optional[str], dataset: Dataset, repos_dir: str):
     """
     Processes the repos in the given csv file, extracting the good ones and
     creating the "triplets" for the dataset.
@@ -120,7 +121,7 @@ def process_repos(csv_file: str, stats_csv: Optional[str], dataset: Dataset):
         if repo_name in already_processed_repos and repo_name not in potentially_good_repos:
             pbar.update(1)
             continue  # skipping because we know there's nothing good already
-        process_repo(repo_name, stats_df, dataset)
+        process_repo(repo_name, stats_df, dataset, repos_dir)
         pbar.update(1)

@@ -128,6 +129,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Creates the triplets for the CRAB dataset.')
     parser.add_argument('csv_file', type=str, help='The csv file containing the projects (the results from clone_repos.py).')
    parser.add_argument('-o', '--output', type=str, default="./dataset.json", help='The file in which the dataset will be contained. Default is "./dataset.json"')
+    parser.add_argument('-r', '--repos', type=str, default="./results/", help='The directory in which the repos were cloned (will be cloned if they aren\'t there already). Default: "./results/"')
     parser.add_argument('-s', '--stats', type=str, help="The name of the output file from stats_pull_requests.py. The stats file already knows which PRs are good (the ones with only 1 comment between two rounds of commits), so instead of going through all the PRs of a repo, we can fast-track using this. If the repo isn't in the stats file, we must go through each PR.")
     # parser.add_argument('-v', '--verbose', action='store_true', help='Prints the number of good projects.')

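The -s/--stats flag exists because the stats file already identifies the good PRs, letting whole repositories be fast-tracked. A rough sketch of that decision, assuming hypothetical column names "repo" and "pr_number" (the real get_good_prs implementation is not shown in this diff):

    import pandas as pd

    def prs_to_check(repo_name: str, stats_df: pd.DataFrame | None) -> list[int] | None:
        # No stats file at all: every PR of the repo must be examined.
        if stats_df is None:
            return None
        rows = stats_df[stats_df["repo"] == repo_name]
        # Repo absent from the stats file: fall back to the full scan.
        if rows.empty:
            return None
        # Fast-track: the stats file already lists the good PR numbers.
        return rows["pr_number"].tolist()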
@@ -138,6 +140,6 @@ if __name__ == "__main__":
     dataset = Dataset()
     try:
         # try and finally to save, regardless of an error occurring or the program finishing correctly
-        process_repos(args.csv_file, args.stats, dataset)
+        process_repos(args.csv_file, args.stats, dataset, args.repos)
     finally:
         dataset.to_json(args.output)
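Since process_repos now takes four positional parameters (csv_file, stats_csv, dataset, repos_dir), keyword arguments at the call site make the pairing explicit. A minimal sketch of the __main__ wiring, using only names visible in this diff:

    args = parser.parse_args()
    dataset = Dataset()
    try:
        process_repos(args.csv_file, stats_csv=args.stats, dataset=dataset, repos_dir=args.repos)
    finally:
        # save whatever was collected, even if an error interrupted processing
        dataset.to_json(args.output)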
|
utils.py (67 changed lines)
@@ -1,9 +1,9 @@
+import os, sys, logging, subprocess
 from datetime import datetime
 from github.Commit import Commit
 from github.PaginatedList import PaginatedList
 from github.PullRequestComment import PullRequestComment
 from tqdm import tqdm
-import logging


 def move_github_logging_to_file():
     github_logger = logging.getLogger("github")
@@ -77,3 +77,68 @@ def has_only_1_comment(commits: PaginatedList[Commit], comments: PaginatedList[PullRequestComment]
             continue
     if verbose: print(f"n_before: {n_before}, n_after: {n_after}")
     return n_before >= 1 and n_after >= 1
+
+def is_already_repo_cloned(repos_dir: str, repo_name: str) -> bool:
+    """
+    Checks if the repository is cloned locally and if its remote URL matches the expected GitHub repository URL.
+
+    Parameters:
+        repos_dir (str): The directory where repositories are stored.
+        repo_name (str): The name of the repository.
+
+    Returns:
+        bool: True if the repository is correctly cloned, False otherwise.
+    """
+    path = os.path.join(repos_dir, repo_name)
+
+    if not os.path.exists(path) or not os.path.isdir(path):
+        return False
+
+    try:
+        result = subprocess.run(
+            ["git", "-C", path, "remote", "-v"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+
+        remote_urls = result.stdout.splitlines()
+        expected_url = f"https://github.com/{repo_name}"
+
+        return any(expected_url in url for url in remote_urls)
+
+    except subprocess.CalledProcessError:
+        return False
+
+def clone(repo: str, dest: str, updates: dict = {}, force: bool = False, verbose: bool = False) -> None:
+    """
+    Clones a GitHub repository into a local directory.
+
+    Args:
+        repo (str): The GitHub repository to clone, in the format "owner/repo".
+        dest (str): The directory to clone the repository into.
+        updates (dict, optional): A dictionary to store updates about the cloning process.
+        force (bool): Whether to force the cloning process, even if the repository already exists.
+        verbose (bool): Whether to print verbose output.
+    """
+    local_repo_path = os.path.join(dest, repo)
+    if not force and is_already_repo_cloned(dest, repo):
+        # if verbose: print(f"Skipping {repo}, already exists")
+        updates["cloned_successfully"] = "Already exists"
+        return
+
+    if verbose: print(f"Cloning {repo}")
+    proc = subprocess.run(
+        ["git", "clone", "--depth", "1", f"https://github.com/{repo}", local_repo_path],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE
+    )
+    if proc.returncode != 0:
+        updates["cloned_successfully"] = False
+        print(f"Failed to clone {repo}", file=sys.stderr)
+        print("Error message was:", file=sys.stderr)
+        error_msg = proc.stderr.decode()
+        print(error_msg, file=sys.stderr)
+        updates["error_msg"] = error_msg
+    else:
+        updates["cloned_successfully"] = True
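Unlike the old os.path.exists check, is_already_repo_cloned verifies via "git -C <path> remote -v" that the directory really is a clone of the expected GitHub repository; the substring match also accepts .git-suffixed remote URLs and both the fetch and push lines of the remote listing. A small sketch of how the two helpers compose (repository name illustrative; updates is omitted, so it falls back to its default):

    import tempfile
    from utils import clone, is_already_repo_cloned

    with tempfile.TemporaryDirectory() as repos_dir:
        assert not is_already_repo_cloned(repos_dir, "karma-riuk/crab")

        clone("karma-riuk/crab", repos_dir, verbose=True)
        assert is_already_repo_cloned(repos_dir, "karma-riuk/crab")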
|