mirror of
https://github.com/karma-riuk/crab.git
synced 2025-07-05 05:28:13 +02:00
207 lines
8.7 KiB
Python
207 lines
8.7 KiB
Python
import pandas as pd
|
|
import argparse, os, docker
|
|
from tqdm import tqdm
|
|
import shutil
|
|
from datetime import datetime
|
|
|
|
from handlers import FailedToCompileError, FailedToTestError, NoTestsFoundError, NoTestResultsToExtractError, get_build_handler
|
|
from utils import clone
|
|
|
|
# Enable tqdm's pandas integration.
tqdm.pandas()

# Repositories that `process_row` skips without cloning or building them.
# The trailing comment on each entry records why it is excluded.
EXCLUSION_LIST = [
    "edmcouncil/idmp", # requires authentication
    "aosp-mirror/platform_frameworks_base", # takes ages to clone
    "alibaba/druid", # tests take literally more than 5 hours
    "hashgraph/hedera-mirror-node", # requires authentication
    "Starcloud-Cloud/starcloud-llmops", # requires authentication
]
|
|
|
|
def remove_dir(dir: str) -> None:
    """
    Delete `dir` recursively, then prune its parent directory if that left the parent empty.

    Args:
        dir (str): Path of the directory tree to delete.
    """
    shutil.rmtree(dir)

    # Only the immediate parent is considered for pruning, never higher ancestors.
    containing_dir = os.path.abspath(os.path.join(dir, os.pardir))
    if os.listdir(containing_dir):
        return
    shutil.rmtree(containing_dir)
|
|
|
|
def process_row(repo, client, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None:
    """
    Run one repository through the pipeline (clone, find build handler, check for tests,
    compile, test) and record the outcome of each stage in `updates`.

    The repository directory is removed when no build handler is found or when any
    stage raises one of the handler errors caught below.

    Args:
        repo: Repository name; checked against EXCLUSION_LIST and joined onto `dest`.
        client: Object handed to the build handler through `set_client`.
        dest (str): Root directory under which the repository is expected.
        updates (dict): Mutated in place with the result fields for this repository.
        force (bool): If `True`, `clone` is called before looking for the repository on disk.
        verbose (bool): If `True`, prints why a repository is skipped or removed.
    """
    updates.update(good_repo_for_crab=False, processed=True)

    with tqdm(total=5, leave=False) as progress:
        if repo in EXCLUSION_LIST:
            updates["error_msg"] = "Repo in exclusion list"
            if verbose:
                print(f"Skipping {repo}, in exclusion list")
            return

        progress.set_postfix_str("Cloning...")
        if force:
            clone(repo, dest, updates, verbose=verbose)
        progress.update(1)

        checkout = os.path.join(dest, repo)
        if not os.path.exists(checkout):
            updates["error_msg"] = "Repo not cloned"
            return

        progress.set_postfix_str("Getting build handler...")
        handler = get_build_handler(dest, repo, updates)
        if handler is None:
            if verbose:
                print(f"Removing {repo}, no build file")
            remove_dir(checkout)
            return
        progress.update(1)

        handler.set_client(client)

        # (progress label, handler method to call, `updates` key set to True on success)
        stages = (
            ("Checking for tests...", "check_for_tests", None),
            ("Compiling...", "compile_repo", "compiled_successfully"),
            ("Running tests...", "test_repo", "tested_successfully"),
        )

        with handler:
            try:
                for label, method_name, success_key in stages:
                    progress.set_postfix_str(label)
                    getattr(handler, method_name)()
                    if success_key is not None:
                        updates[success_key] = True
                    progress.update(1)

                handler.clean_repo()

                # Reaching this point means the repo was not removed, so it is a good repo
                updates["good_repo_for_crab"] = True
            except (
                NoTestsFoundError,
                FailedToCompileError,
                FailedToTestError,
                NoTestResultsToExtractError,
            ) as failure:
                updates["error_msg"] = str(failure)

                # Tested in the same order as separate `except` clauses would be matched.
                if isinstance(failure, NoTestsFoundError):
                    notice = f"Removing {repo}, error: no tests found"
                elif isinstance(failure, FailedToCompileError):
                    updates["compiled_successfully"] = False
                    notice = f"Removing {repo}, error: failed to compile"
                elif isinstance(failure, FailedToTestError):
                    updates["tested_successfully"] = False
                    notice = f"Removing {repo}, error: failed to run tests"
                else:
                    notice = f"Removing {repo}, error: failed to extract test results"

                if verbose:
                    print(notice)
                remove_dir(checkout)
                return
|
|
|
|
|
|
def save_df_with_updates(df, updates_list, results_file: str, verbose=False):
    """
    Write the collected cell updates into `df`, then dump `df` to `results_file` as CSV.

    Args:
        df: DataFrame that receives the updates; it is modified in place.
        updates_list: Iterable of `(index, updates)` pairs, where `updates` maps
            column names to the new value for the row labelled `index`.
        results_file (str): Path of the CSV file to write (the index is not written).
        verbose: If true, prints a message just before writing.
    """
    # Flatten the per-row dicts into individual (row, column, value) cells
    pending_cells = (
        (row_label, column, new_value)
        for row_label, changes in updates_list
        for column, new_value in changes.items()
    )
    for row_label, column, new_value in pending_cells:
        df.at[row_label, column] = new_value

    if verbose:
        print("Writing results...")
    df.to_csv(results_file, index=False)
|
|
|
|
def process_repos(file: str, dest: str, results_file: str, /, lazy: bool = False, force: bool = False, verbose: bool = False) -> None:
    """
    Process every repository listed in `file` and record the outcome of each one in `results_file`.

    Each repository name is handed to `process_row`. The results table is written to
    `results_file` every time the row index is a multiple of 10, when the run is
    interrupted or fails (before the error is re-raised), and once more at the end.

    Arguments:
        file (str): The name of the file listing the repos; it must contain a `name` column. Expected to be a .csv.gz file (downloaded from https://seart-ghs.si.usi.ch)
        dest (str): The name of the root directory in which to download the repos
        results_file (str): The CSV file the results are written to. With `lazy`, it is also read at start-up to find the repos that were already processed.
        lazy (bool): If `True`, resumes from `results_file`: rows marked as processed there are skipped. Defaults to `False`.
        force (bool): Forwarded to `process_row` to force the cloning of each repo. Defaults to `False`.
        verbose (bool): If `True`, outputs detailed process information. Defaults to `False`.
    """
    if verbose: print(f"Reading CSV file {file}")
    df = pd.read_csv(file)
    results_df = pd.read_csv(results_file) if lazy else None

    # drop all columns besides the name
    df = df[["name"]]
    df = df.assign(
        processed=False,
        cloned_successfully=None,
        build_system=None,
        depth_of_build_file=None,
        detected_source_of_tests=None,
        compiled_successfully=None,
        tested_successfully=None,
        n_tests=None,
        n_tests_with_grep=None,
        n_tests_passed=None,
        n_tests_failed=None,
        n_tests_errors=None,
        n_tests_skipped=None,
        good_repo_for_crab=None,
        error_msg=None,
    )

    updates_list = []  # Updates collected since the last checkpoint
    client = docker.from_env()

    good_repos = 0
    n_processed = 0
    last_i_saved = -1
    to_be_processed = df
    if lazy and results_df is not None:
        df = results_df.copy()
        only_processed = results_df[results_df["processed"]]
        # `== True` is an element-wise pandas comparison here, not a truthiness test
        good_repos = only_processed[only_processed["good_repo_for_crab"] == True]["good_repo_for_crab"].sum()
        n_processed = len(only_processed)
        last_i_saved = n_processed
        # Rows that still need processing keep their original index labels, which are
        # the labels used later to write the updates back into df
        to_be_processed = df.loc[~df["name"].isin(only_processed["name"])]

    def _checkpoint() -> None:
        # Write the pending updates into df and save it. Once written, the updates live
        # in df itself, so the buffer is drained instead of being re-applied in full
        # at every later checkpoint.
        save_df_with_updates(df, updates_list, results_file, verbose=verbose)
        updates_list.clear()

    try:
        if verbose: print("Processing repositories")
        with tqdm(total=len(df)) as pbar:
            pbar.update(n_processed)
            for i, row in to_be_processed.iterrows():
                if i % 10 == 0:
                    _checkpoint()
                    last_i_saved = i
                pbar.set_postfix({
                    "repo": row["name"],
                    "last index saved": last_i_saved,
                    "# good repos": f"{good_repos} ({good_repos/n_processed if n_processed > 0 else 0:.2%})",
                    "time": datetime.now().strftime("%H:%M:%S")
                })
                # Registered before processing so that a partially filled dict is still
                # saved if process_row is interrupted or fails
                updates = {}
                updates_list.append((i, updates))
                process_row(row["name"], client, dest, updates, force=force, verbose=verbose)
                if updates.get("good_repo_for_crab"):
                    good_repos += 1
                pbar.update(1)
                n_processed += 1
    except KeyboardInterrupt:
        print("Interrupted by user, saving progress...")
        _checkpoint()
        raise
    except Exception:
        print("An error occured, saving progress and then raising the error...")
        _checkpoint()
        raise

    if verbose: print("Saving results...")
    _checkpoint()
|
|
|
|
if __name__ == "__main__":
    # Parse the command-line arguments
    parser = argparse.ArgumentParser(description="Clone repos from a given file")
    # nargs="?" makes the positional optional; without it argparse always requires the
    # argument and the advertised default can never be used
    parser.add_argument("file", nargs="?", default="results.csv.gz", help="The file to download the repos from. Default is 'results.csv.gz'")
    parser.add_argument("-d", "--dest", default="./results/", help="The root directory in which to download the repos. Default is './results/'")
    parser.add_argument("-r", "--results", default="repos.csv", help="The name of file in which to save the results. Also used with --lazy. Default is 'repos.csv'")
    parser.add_argument("-l", "--lazy", action="store_true", help="If given, the program will continue from where it left off, by not touch the already processed repos. Will look at the file pointed by the --results argument")
    parser.add_argument("-f", "--force", action="store_true", help="Force the download of the repos")
    parser.add_argument("-v", "--verbose", action="store_true", help="Make the program verbose")
    args = parser.parse_args()

    process_repos(args.file, args.dest, args.results, lazy=args.lazy, force=args.force, verbose=args.verbose)
|
|
|