diff --git a/clone_repos.py b/clone_repos.py index 049a94b..c17dc69 100644 --- a/clone_repos.py +++ b/clone_repos.py @@ -9,7 +9,7 @@ EXCLUSION_LIST = [ "edmcouncil/idmp", ] -def clone(repo: str, dest: str, row: pd.Series, force: bool = False, verbose: bool = False) -> None: +def clone(repo: str, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None: """ Clones a GitHub repository into a local directory. @@ -28,16 +28,16 @@ def clone(repo: str, dest: str, row: pd.Series, force: bool = False, verbose: bo stderr=subprocess.PIPE ) if proc.returncode != 0: - row["successfully_cloned"] = False + updates["successfully_cloned"] = False print(f"Failed to clone {repo}", file=sys.stderr) print(f"Error message was:", file=sys.stderr) error_msg = proc.stderr.decode() print(error_msg, file=sys.stderr) - row["error_msg"] = error_msg + updates["error_msg"] = error_msg else: - row["successfully_cloned"] = True + updates["successfully_cloned"] = True -def get_build_file(root: str, repo: str, row: pd.Series, verbose: bool = False): +def get_build_file(root: str, repo: str, updates: dict, verbose: bool = False): """ Get the path to the build file of a repository. The build file is either a `pom.xml`, `build.gradle`, or `build.xml` file. @@ -54,14 +54,14 @@ def get_build_file(root: str, repo: str, row: pd.Series, verbose: bool = False): if not os.path.isdir(path): error_msg = f"The path {path} is not a valid directory." print(error_msg, file=sys.stderr) - row["error_msg"] = error_msg + updates["error_msg"] = error_msg return None to_keep = ["pom.xml", "build.gradle", "build.xml"] for entry in os.scandir(path): if entry.is_file() and entry.name in to_keep: if verbose: print(f"Found {entry.name} in {repo} root, so keeping it and returning") - row["depth_of_build_file"] = 0 + updates["depth_of_build_file"] = 0 return os.path.join(path, entry.name) # List files in the immediate subdirectories @@ -70,24 +70,24 @@ def get_build_file(root: str, repo: str, row: pd.Series, verbose: bool = False): for sub_entry in os.scandir(entry.path): if sub_entry.is_file() and sub_entry.name in to_keep: if verbose: print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning") - row["depth_of_build_file"] = 1 + updates["depth_of_build_file"] = 1 return os.path.join(path, entry.name, sub_entry.name) - row["error_msg"] = "No build file found" + updates["error_msg"] = "No build file found" return None -def has_tests(path: str, build_file: str, row: pd.Series) -> bool: +def has_tests(path: str, build_file: str, updates: dict) -> bool: with open(build_file, "r") as f: content = f.read() for library in ["junit", "testng", "mockito"]: if library in content: - row["detected_source_of_tests"] = library + " library in build file" + updates["detected_source_of_tests"] = library + " library in build file" return True for keyword in ["testImplementation", "functionalTests", "bwc_tests_enabled"]: if keyword in content: - row["detected_source_of_tests"] = keyword + " keyword in build file" + updates["detected_source_of_tests"] = keyword + " keyword in build file" return False test_dirs = [ @@ -98,10 +98,10 @@ def has_tests(path: str, build_file: str, row: pd.Series) -> bool: ] for td in test_dirs: if os.path.exists(os.path.join(path, td)): - row["detected_source_of_tests"] = td + " dir exists in repo" + updates["detected_source_of_tests"] = td + " dir exists in repo" return True - row["error_msg"] = "No tests found" + updates["error_msg"] = "No tests found" return False def remove_dir(dir: str) -> None: @@ -117,37 +117,47 @@ def remove_dir(dir: str) -> None: shutil.rmtree(parent) -def process_row(row, dest: str, force: bool = False, verbose: bool = False): - repo = row["name"] - if repo in EXCLUSION_LIST: - row["error_msg"] = "Repo in exclusion list" - if verbose: print(f"Skipping {repo}, in exclusion list") - return +def process_row(row, dest: str, force: bool = False, verbose: bool = False) -> dict: + updates = {} # Dictionary to store updates + with tqdm(total=3, leave=False) as pbar: + repo = row["name"] + if repo in EXCLUSION_LIST: + updates["error_msg"] = "Repo in exclusion list" + if verbose: print(f"Skipping {repo}, in exclusion list") + return updates - if force: - clone(repo, dest, row, verbose=verbose) + pbar.set_postfix_str("Cloning...") + if force: + clone(repo, dest, updates, verbose=verbose) + pbar.update(1) - repo_path = os.path.join(dest, repo) - if not os.path.exists(repo_path): - row["error_msg"] = "Repo not cloned" - return + repo_path = os.path.join(dest, repo) + if not os.path.exists(repo_path): + updates["error_msg"] = "Repo not cloned" + return updates - build_file = get_build_file(dest, repo, row) - if build_file is None: - if verbose: print(f"Removing {repo}, no build file") - remove_dir(repo_path) - return - - if not has_tests(repo_path, build_file, row): - if verbose: print(f"Removing {repo}, no test suites") - remove_dir(repo_path) - return - # if verbose: print(f"Keeping {repo}") + pbar.set_postfix_str("Getting build file...") + build_file = get_build_file(dest, repo, updates) + if build_file is None: + if verbose: print(f"Removing {repo}, no build file") + remove_dir(repo_path) + return updates + pbar.update(1) + - # Check for compilation and tests + pbar.set_postfix_str("Checking for tests...") + # if not has_tests(repo_path, build_file, updates): + # if verbose: print(f"Removing {repo}, no test suites") + # remove_dir(repo_path) + # return + # if verbose: print(f"Keeping {repo}") + pbar.update(1) - # If repo was not removed, then it is a good repo - row["good_repo_for_crab"] = True + # Check for compilation and tests + + # If repo was not removed, then it is a good repo + updates["good_repo_for_crab"] = True + return updates def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False) -> None: """ @@ -162,26 +172,48 @@ def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False) if verbose: print(f"Reading CSV file {file}") df = pd.read_csv(file) - df["successfully_cloned"] = None - df["build_system"] = None - df["depth_of_build_file"] = None - df["detected_source_of_tests"] = None - df["error_msg"] = None - df["good_repo_for_crab"] = False - df["n_tests"] = None - df["n_tests_with_grep"] = None - df["n_tests_passed"] = None - df["n_tests_failed"] = None - df["n_tests_skipped"] = None + # drop all columns besides the name + df = df[["name"]] + updates_list = [] # Collect updates in a list + + good_repos = 0 try: if verbose: print("Processing repositories") - df.progress_apply(lambda row: process_row(row, dest, force=force, verbose=verbose), axis=1) + with tqdm(total=len(df)) as pbar: + for i, row in df.iterrows(): + updates = process_row(row, dest, force=force, verbose=verbose) + if "good_repo_for_crab" in updates and updates["good_repo_for_crab"]: + good_repos += 1 + pbar.update(1) + pbar.set_postfix({"repo": row["name"], "good_repos": good_repos}, refresh=True) + updates_list.append((i, updates)) # Collect updates except KeyboardInterrupt: print("Keyboard interrupt detected. Stopping the processing of the repos...") + + # Create columns for the new data + df = df.assign( + successfully_cloned=None, + build_system=None, + depth_of_build_file=None, + detected_source_of_tests=None, + error_msg=None, + good_repo_for_crab=False, + n_tests=None, + n_tests_with_grep=None, + n_tests_passed=None, + n_tests_failed=None, + n_tests_skipped=None + ) + + # Set the new data + for index, updates in updates_list: + for col, value in updates.items(): + df.at[index, col] = value # Batch updates to avoid fragmentation + if verbose: print("Writing results...") - df.to_csv("results.csv.gz", index=False) + df.to_csv("results.csv", index=False) if __name__ == "__main__":