Extracted the process_row function out of clone_repos because clone_repos was becoming quite big.
2025-07-05 13:38:12 +02:00 · 2025-02-28 14:14:20 +01:00
parent a6d7bdb865
commit a4be07c04e
1 changed file with 35 additions and 36 deletions
--- a/clone_repos.py
+++ b/clone_repos.py
@ -117,33 +117,7 @@ def remove_dir(dir: str) -> None:
        shutil.rmtree(parent)
-def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False) -> None:
+def process_row(row, dest: str, force: bool = False, verbose: bool = False):
    """
    Download the repos listed in the file passed as argument. The downloaded repos will be placed in the folder that is named as the dest argument.
    Arguments:
        file (str): The name of the file to download the repos from. Must be a .csv.gz file (downloaded from https://seart-ghs.si.usi.ch)
        dest (str): The name of the root directory in which to download the repos
        verbose (bool): If `True`, outputs detailed process information. Defaults to `False`.
    """
    if verbose: print(f"Reading CSV file {file}")
    df = pd.read_csv(file)
    df["successfully_cloned"] = None
    df["build_system"] = None
    df["depth_of_build_file"] = None
    df["detected_source_of_tests"] = None
    df["error_msg"] = None
    df["good_repo_for_crab"] = False
    df["n_tests"] = None
    df["n_tests_with_grep"] = None
    df["n_tests_passed"] = None
    df["n_tests_failed"] = None
    df["n_tests_skipped"] = None
    if verbose: print("Cloning repositories")
    def _process(row)->None:
    repo = row["name"]
    if repo in EXCLUSION_LIST:
        row["error_msg"] = "Repo in exclusion list"
@ -175,13 +149,38 @@ def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False)
    # If repo was not removed, then it is a good repo
    row["good_repo_for_crab"] = True
 def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False) -> None:
    """
    Download the repos listed in the file passed as argument. The downloaded repos will be placed in the folder that is named as the dest argument.
    Arguments:
        file (str): The name of the file to download the repos from. Must be a .csv.gz file (downloaded from https://seart-ghs.si.usi.ch)
        dest (str): The name of the root directory in which to download the repos
        verbose (bool): If `True`, outputs detailed process information. Defaults to `False`.
    """
    if verbose: print(f"Reading CSV file {file}")
    df = pd.read_csv(file)
    df["successfully_cloned"] = None
    df["build_system"] = None
    df["depth_of_build_file"] = None
    df["detected_source_of_tests"] = None
    df["error_msg"] = None
    df["good_repo_for_crab"] = False
    df["n_tests"] = None
    df["n_tests_with_grep"] = None
    df["n_tests_passed"] = None
    df["n_tests_failed"] = None
    df["n_tests_skipped"] = None
    try:
-        df.progress_apply(_process, axis=1)
+        if verbose: print("Processing repositories")
        df.progress_apply(lambda row: process_row(row, dest, force=force, verbose=verbose), axis=1)
    except KeyboardInterrupt:
        print("Keyboard interrupt detected. Stopping the processing of the repos...")
-    if verbose: print("Writing CSV file")
+    if verbose: print("Writing results...")
    df.to_csv("results.csv.gz", index=False)