Files
crab/clone_repos.py
Karma Riuk 082f6c6f71 using exceptions to interrupt the flow of
execution instead of having to propagate a boolean
2025-03-14 16:10:54 +01:00

207 lines
8.7 KiB
Python

import pandas as pd
import argparse, os, docker
from tqdm import tqdm
import shutil
from datetime import datetime
from handlers import FailedToCompileError, FailedToTestError, NoTestsFoundError, NoTestResultsToExtractError, get_build_handler
from utils import clone
# Calls tqdm's pandas hook at import time. NOTE(review): presumably this enables the
# progress_* helpers on pandas objects; nothing in the visible code uses them — confirm before removing.
tqdm.pandas()
# Repositories ("owner/name") that must never be processed, each with the reason it is skipped.
EXCLUSION_LIST = [
    "edmcouncil/idmp",  # needs authentication
    "aosp-mirror/platform_frameworks_base",  # cloning takes far too long
    "alibaba/druid",  # test suite runs for more than 5 hours
    "hashgraph/hedera-mirror-node",  # needs authentication
    "Starcloud-Cloud/starcloud-llmops",  # needs authentication
]
def remove_dir(dir: str) -> None:
    """
    Removes a directory and all its contents. Also removes the parent directory if it is
    empty once the child (dir) is gone.

    Args:
        dir (str): The directory to remove.

    Raises:
        OSError: If `dir` cannot be removed (e.g. it does not exist), or if the parent
            stops being empty between the emptiness check and its removal.
    """
    shutil.rmtree(dir)

    parent = os.path.abspath(os.path.join(dir, os.path.pardir))
    if not os.listdir(parent):
        # os.rmdir only deletes empty directories, so anything that appears in the parent
        # after the check above is never wiped (a recursive rmtree would delete it).
        os.rmdir(parent)
def process_row(repo, client, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None:
    """
    Runs one repository through the pipeline (clone, find build handler, check for tests,
    compile, test) and records the outcome in `updates`.

    The repository directory is deleted (via `remove_dir`) whenever no build handler is
    found or one of the handler steps raises one of the known handler errors.

    Args:
        repo: Name of the repository; compared against EXCLUSION_LIST and joined to `dest`.
        client: Client object handed to the build handler through `set_client`.
        dest (str): Root directory under which the repository lives.
        updates (dict): Filled in place with the result columns for this repository.
        force (bool): If `True`, `clone` is called before looking for the repository on disk.
        verbose (bool): If `True`, prints why a repository is skipped or removed.
    """
    # Start pessimistic: only a fully successful run flips good_repo_for_crab to True.
    updates.update(good_repo_for_crab=False, processed=True)

    with tqdm(total=5, leave=False) as progress:
        if repo in EXCLUSION_LIST:
            updates["error_msg"] = "Repo in exclusion list"
            if verbose:
                print(f"Skipping {repo}, in exclusion list")
            return

        progress.set_postfix_str("Cloning...")
        if force:
            clone(repo, dest, updates, verbose=verbose)
        progress.update(1)

        repo_path = os.path.join(dest, repo)
        if not os.path.exists(repo_path):
            updates["error_msg"] = "Repo not cloned"
            return

        progress.set_postfix_str("Getting build handler...")
        handler = get_build_handler(dest, repo, updates)
        if handler is None:
            if verbose:
                print(f"Removing {repo}, no build file")
            remove_dir(repo_path)
            return
        progress.update(1)

        handler.set_client(client)
        with handler:
            try:
                progress.set_postfix_str("Checking for tests...")
                handler.check_for_tests()
                progress.update(1)

                progress.set_postfix_str("Compiling...")
                handler.compile_repo()
                updates["compiled_successfully"] = True
                progress.update(1)

                progress.set_postfix_str("Running tests...")
                handler.test_repo()
                updates["tested_successfully"] = True
                progress.update(1)

                handler.clean_repo()

                # Reaching this point means no step raised, so the repo was kept on disk.
                updates["good_repo_for_crab"] = True
            except NoTestsFoundError as err:
                updates["error_msg"] = str(err)
                if verbose:
                    print(f"Removing {repo}, error: no tests found")
                remove_dir(repo_path)
            except FailedToCompileError as err:
                updates["error_msg"] = str(err)
                updates["compiled_successfully"] = False
                if verbose:
                    print(f"Removing {repo}, error: failed to compile")
                remove_dir(repo_path)
            except FailedToTestError as err:
                updates["error_msg"] = str(err)
                updates["tested_successfully"] = False
                if verbose:
                    print(f"Removing {repo}, error: failed to run tests")
                remove_dir(repo_path)
            except NoTestResultsToExtractError as err:
                updates["error_msg"] = str(err)
                if verbose:
                    print(f"Removing {repo}, error: failed to extract test results")
                remove_dir(repo_path)
def save_df_with_updates(df, updates_list, results_file: str, verbose=False):
    """
    Applies the collected updates to `df` in place and writes it to `results_file` as CSV.

    The CSV is first written to a temporary file next to `results_file` and then moved
    over it, so an interruption while writing never leaves a truncated results file.

    Args:
        df: The DataFrame to update and save.
        updates_list: Iterable of `(index, updates)` pairs, where `updates` maps a column
            name to the value to store at row label `index`.
        results_file (str): Path of the CSV file to write (without the index column).
        verbose (bool): If `True`, prints a message before writing.
    """
    # Set the new data
    for index, updates in updates_list:
        for col, value in updates.items():
            df.at[index, col] = value  # Batch updates to avoid fragmentation

    if verbose:
        print("Writing results...")

    # The temp name is built with a prefix (not a suffix) so the file extension, which
    # pandas uses to infer compression, is the same as for the final file.
    out_dir, out_name = os.path.split(results_file)
    tmp_file = os.path.join(out_dir, f".tmp-{out_name}")
    try:
        df.to_csv(tmp_file, index=False)
        os.replace(tmp_file, results_file)
    except BaseException:
        # Don't leave a half-written temp file behind (also on KeyboardInterrupt).
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        raise
def process_repos(file: str, dest: str, results_file: str, /, lazy: bool = False, force: bool = False, verbose: bool = False) -> None:
    """
    Process the repos listed in the file passed as argument and record, for each of them,
    whether it could be cloned, compiled and tested. The repos are looked for (and, with
    `force`, downloaded) in the folder that is named as the dest argument. Progress is saved
    to `results_file` periodically, at the end, and before re-raising on interruption/error.

    Arguments:
        file (str): The name of the file listing the repos. Must be a CSV file with a `name`
            column, e.g. the .csv.gz export downloaded from https://seart-ghs.si.usi.ch
        dest (str): The name of the root directory in which to download the repos
        results_file (str): The CSV file in which the results are saved. With `lazy`, it is
            also read to find out which repos were already processed.
        lazy (bool): If `True`, resume from `results_file` and skip the repos already marked
            as processed there. If that file does not exist, start from scratch.
        force (bool): Passed on to `process_row`; if `True`, the repos are cloned.
        verbose (bool): If `True`, outputs detailed process information. Defaults to `False`.

    Raises:
        Any exception (including KeyboardInterrupt) raised while processing is re-raised
        after the progress made so far has been saved.
    """
    if verbose: print(f"Reading CSV file {file}")
    df = pd.read_csv(file)

    results_df = None
    if lazy:
        if os.path.exists(results_file):
            results_df = pd.read_csv(results_file)
        else:
            # First run with --lazy: nothing to resume from, so behave like a fresh run
            print(f"Results file {results_file} not found, starting from scratch")

    # drop all columns besides the name
    df = df[["name"]]
    df = df.assign(
        processed=False,
        cloned_successfully=None,
        build_system=None,
        depth_of_build_file=None,
        detected_source_of_tests=None,
        compiled_successfully=None,
        tested_successfully=None,
        n_tests=None,
        n_tests_with_grep=None,
        n_tests_passed=None,
        n_tests_failed=None,
        n_tests_errors=None,
        n_tests_skipped=None,
        good_repo_for_crab=None,
        error_msg=None,
    )

    updates_list = []  # Updates not yet applied to df, as (index, updates) pairs
    client = docker.from_env()

    good_repos = 0
    n_processed = 0
    last_i_saved = -1
    to_be_processed = df

    if results_df is not None:
        df = results_df.copy()
        only_processed = results_df[results_df["processed"]]
        good_repos = int((only_processed["good_repo_for_crab"] == True).sum())
        n_processed = len(only_processed)
        last_i_saved = n_processed
        # the .loc is to have a view of df and not to make a copy (a copy resets the index and we don't want that)
        to_be_processed = df.loc[~df["name"].isin(only_processed["name"])]

    try:
        if verbose: print("Processing repositories")
        with tqdm(total=len(df)) as pbar:
            pbar.update(n_processed)
            for i, row in to_be_processed.iterrows():
                if i % 10 == 0:
                    save_df_with_updates(df, updates_list, results_file, verbose=verbose)
                    # Everything collected so far now lives in df; dropping it avoids
                    # re-applying every earlier update on each subsequent save.
                    updates_list.clear()
                    last_i_saved = i

                pbar.set_postfix({
                    "repo": row["name"],
                    "last index saved": last_i_saved,
                    "# good repos": f"{good_repos} ({good_repos/n_processed if n_processed > 0 else 0:.2%})",
                    "time": datetime.now().strftime("%H:%M:%S")
                })

                # Registered before processing so that a partially filled dict is still
                # saved if process_row raises.
                updates = {}
                updates_list.append((i, updates))
                process_row(row["name"], client, dest, updates, force=force, verbose=verbose)
                if "good_repo_for_crab" in updates and updates["good_repo_for_crab"]:
                    good_repos += 1
                pbar.update(1)
                n_processed += 1
    except KeyboardInterrupt:
        print("Interrupted by user, saving progress...")
        save_df_with_updates(df, updates_list, results_file, verbose=verbose)
        raise
    except Exception:
        print("An error occured, saving progress and then raising the error...")
        save_df_with_updates(df, updates_list, results_file, verbose=verbose)
        raise

    if verbose: print("Saving results...")
    save_df_with_updates(df, updates_list, results_file, verbose=verbose)
def build_arg_parser() -> argparse.ArgumentParser:
    """
    Builds the command-line parser of this script.

    Returns:
        argparse.ArgumentParser: Parser exposing `file`, `--dest`, `--results`, `--lazy`,
        `--force` and `--verbose`.
    """
    parser = argparse.ArgumentParser(description="Clone repos from a given file")
    # nargs="?" is what makes the default effective: without it argparse treats the
    # positional as required and never uses `default`.
    parser.add_argument("file", nargs="?", default="results.csv.gz", help="The file to download the repos from. Default is 'results.csv.gz'")
    parser.add_argument("-d", "--dest", default="./results/", help="The root directory in which to download the repos. Default is './results/'")
    parser.add_argument("-r", "--results", default="repos.csv", help="The name of file in which to save the results. Also used with --lazy. Default is 'repos.csv'")
    parser.add_argument("-l", "--lazy", action="store_true", help="If given, the program will continue from where it left off, by not touching the already processed repos. Will look at the file pointed by the --results argument")
    parser.add_argument("-f", "--force", action="store_true", help="Force the download of the repos")
    parser.add_argument("-v", "--verbose", action="store_true", help="Make the program verbose")
    return parser


if __name__ == "__main__":
    args = build_arg_parser().parse_args()

    process_repos(args.file, args.dest, args.results, lazy=args.lazy, force=args.force, verbose=args.verbose)