Extracted the process_row function because it was becoming quite big.
2025-10-13 19:58:02 +02:00 · 2025-02-28 14:14:20 +01:00
parent a6d7bdb865
commit a4be07c04e
1 changed files with 35 additions and 36 deletions
--- a/clone_repos.py
+++ b/clone_repos.py
@@ -117,33 +117,7 @@ def remove_dir(dir: str) -> None:
        shutil.rmtree(parent)


-def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False) -> None:
-    """
-    Download the repos listed in the file passed as argument. The downloaded repos will be placed in the folder that is named as the dest argument.
-
-
-    Arguments:
-        file (str): The name of the file to download the repos from. Must be a .csv.gz file (downloaded from https://seart-ghs.si.usi.ch)
-        dest (str): The name of the root directory in which to download the repos
-        verbose (bool): If `True`, outputs detailed process information. Defaults to `False`.
-    """
-    if verbose: print(f"Reading CSV file {file}")
-    df = pd.read_csv(file)
-
-    df["successfully_cloned"] = None
-    df["build_system"] = None
-    df["depth_of_build_file"] = None
-    df["detected_source_of_tests"] = None
-    df["error_msg"] = None
-    df["good_repo_for_crab"] = False
-    df["n_tests"] = None
-    df["n_tests_with_grep"] = None
-    df["n_tests_passed"] = None
-    df["n_tests_failed"] = None
-    df["n_tests_skipped"] = None
-
-    if verbose: print("Cloning repositories")
-    def _process(row)->None:
+def process_row(row, dest: str, force: bool = False, verbose: bool = False):
    repo = row["name"]
    if repo in EXCLUSION_LIST:
        row["error_msg"] = "Repo in exclusion list"
@@ -175,13 +149,38 @@ def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False)
    # If repo was not removed, then it is a good repo
    row["good_repo_for_crab"] = True

+def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False) -> None:
+    """
+    Download the repos listed in the file passed as argument. The downloaded repos will be placed in the folder that is named as the dest argument.
+
+
+    Arguments:
+        file (str): The name of the file to download the repos from. Must be a .csv.gz file (downloaded from https://seart-ghs.si.usi.ch)
+        dest (str): The name of the root directory in which to download the repos
+        verbose (bool): If `True`, outputs detailed process information. Defaults to `False`.
+    """
+    if verbose: print(f"Reading CSV file {file}")
+    df = pd.read_csv(file)
+
+    df["successfully_cloned"] = None
+    df["build_system"] = None
+    df["depth_of_build_file"] = None
+    df["detected_source_of_tests"] = None
+    df["error_msg"] = None
+    df["good_repo_for_crab"] = False
+    df["n_tests"] = None
+    df["n_tests_with_grep"] = None
+    df["n_tests_passed"] = None
+    df["n_tests_failed"] = None
+    df["n_tests_skipped"] = None

    try:
-        df.progress_apply(_process, axis=1)
+        if verbose: print("Processing repositories")
+        df.progress_apply(lambda row: process_row(row, dest, force=force, verbose=verbose), axis=1)
    except KeyboardInterrupt:
        print("Keyboard interrupt detected. Stopping the processing of the repos...")

-    if verbose: print("Writing CSV file")
+    if verbose: print("Writing results...")
    df.to_csv("results.csv.gz", index=False)