now the rows actually get updated

Author: Karma Riuk
Date:   2025-02-28 15:25:56 +01:00
parent: a4be07c04e
commit: b918c0044c


@@ -9,7 +9,7 @@ EXCLUSION_LIST = [
     "edmcouncil/idmp",
 ]
 
-def clone(repo: str, dest: str, row: pd.Series, force: bool = False, verbose: bool = False) -> None:
+def clone(repo: str, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None:
     """
     Clones a GitHub repository into a local directory.
@@ -28,16 +28,16 @@ def clone(repo: str, dest: str, row: pd.Series, force: bool = False, verbose: bo
         stderr=subprocess.PIPE
     )
     if proc.returncode != 0:
-        row["successfully_cloned"] = False
+        updates["successfully_cloned"] = False
         print(f"Failed to clone {repo}", file=sys.stderr)
         print(f"Error message was:", file=sys.stderr)
         error_msg = proc.stderr.decode()
         print(error_msg, file=sys.stderr)
-        row["error_msg"] = error_msg
+        updates["error_msg"] = error_msg
     else:
-        row["successfully_cloned"] = True
+        updates["successfully_cloned"] = True
 
-def get_build_file(root: str, repo: str, row: pd.Series, verbose: bool = False):
+def get_build_file(root: str, repo: str, updates: dict, verbose: bool = False):
     """
     Get the path to the build file of a repository. The build file is either a
     `pom.xml`, `build.gradle`, or `build.xml` file.
@@ -54,14 +54,14 @@ def get_build_file(root: str, repo: str, row: pd.Series, verbose: bool = False):
     if not os.path.isdir(path):
         error_msg = f"The path {path} is not a valid directory."
         print(error_msg, file=sys.stderr)
-        row["error_msg"] = error_msg
+        updates["error_msg"] = error_msg
         return None
 
     to_keep = ["pom.xml", "build.gradle", "build.xml"]
     for entry in os.scandir(path):
         if entry.is_file() and entry.name in to_keep:
             if verbose: print(f"Found {entry.name} in {repo} root, so keeping it and returning")
-            row["depth_of_build_file"] = 0
+            updates["depth_of_build_file"] = 0
             return os.path.join(path, entry.name)
 
     # List files in the immediate subdirectories
@@ -70,24 +70,24 @@ def get_build_file(root: str, repo: str, row: pd.Series, verbose: bool = False):
             for sub_entry in os.scandir(entry.path):
                 if sub_entry.is_file() and sub_entry.name in to_keep:
                     if verbose: print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning")
-                    row["depth_of_build_file"] = 1
+                    updates["depth_of_build_file"] = 1
                     return os.path.join(path, entry.name, sub_entry.name)
 
-    row["error_msg"] = "No build file found"
+    updates["error_msg"] = "No build file found"
     return None
 
-def has_tests(path: str, build_file: str, row: pd.Series) -> bool:
+def has_tests(path: str, build_file: str, updates: dict) -> bool:
     with open(build_file, "r") as f:
         content = f.read()
 
     for library in ["junit", "testng", "mockito"]:
         if library in content:
-            row["detected_source_of_tests"] = library + " library in build file"
+            updates["detected_source_of_tests"] = library + " library in build file"
             return True
 
     for keyword in ["testImplementation", "functionalTests", "bwc_tests_enabled"]:
         if keyword in content:
-            row["detected_source_of_tests"] = keyword + " keyword in build file"
+            updates["detected_source_of_tests"] = keyword + " keyword in build file"
             return False
 
     test_dirs = [
@@ -98,10 +98,10 @@ def has_tests(path: str, build_file: str, row: pd.Series) -> bool:
     ]
     for td in test_dirs:
         if os.path.exists(os.path.join(path, td)):
-            row["detected_source_of_tests"] = td + " dir exists in repo"
+            updates["detected_source_of_tests"] = td + " dir exists in repo"
             return True
 
-    row["error_msg"] = "No tests found"
+    updates["error_msg"] = "No tests found"
     return False
 
 def remove_dir(dir: str) -> None:
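Note on the signature change above: the helpers now receive a plain `updates: dict` instead of a `pd.Series` row. A dict is passed by reference, so anything the helper writes into it is visible to the caller that created it. A minimal sketch (the `helper` name is illustrative, not from the repo):

    def helper(updates: dict) -> None:
        # writes land in the caller's dict, no return value needed
        updates["error_msg"] = "No build file found"

    updates = {}
    helper(updates)
    print(updates)  # {'error_msg': 'No build file found'}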
@@ -117,37 +117,47 @@ def remove_dir(dir: str) -> None:
         shutil.rmtree(parent)
 
-def process_row(row, dest: str, force: bool = False, verbose: bool = False):
+def process_row(row, dest: str, force: bool = False, verbose: bool = False) -> dict:
+    updates = {} # Dictionary to store updates
+    with tqdm(total=3, leave=False) as pbar:
         repo = row["name"]
         if repo in EXCLUSION_LIST:
-            row["error_msg"] = "Repo in exclusion list"
+            updates["error_msg"] = "Repo in exclusion list"
             if verbose: print(f"Skipping {repo}, in exclusion list")
-            return
+            return updates
 
+        pbar.set_postfix_str("Cloning...")
         if force:
-            clone(repo, dest, row, verbose=verbose)
+            clone(repo, dest, updates, verbose=verbose)
+        pbar.update(1)
 
         repo_path = os.path.join(dest, repo)
         if not os.path.exists(repo_path):
-            row["error_msg"] = "Repo not cloned"
-            return
+            updates["error_msg"] = "Repo not cloned"
+            return updates
 
-        build_file = get_build_file(dest, repo, row)
+        pbar.set_postfix_str("Getting build file...")
+        build_file = get_build_file(dest, repo, updates)
         if build_file is None:
             if verbose: print(f"Removing {repo}, no build file")
             remove_dir(repo_path)
-            return
+            return updates
+        pbar.update(1)
 
-        if not has_tests(repo_path, build_file, row):
-            if verbose: print(f"Removing {repo}, no test suites")
-            remove_dir(repo_path)
-            return
+        pbar.set_postfix_str("Checking for tests...")
+        # if not has_tests(repo_path, build_file, updates):
+        #     if verbose: print(f"Removing {repo}, no test suites")
+        #     remove_dir(repo_path)
+        #     return
 
         # if verbose: print(f"Keeping {repo}")
+        pbar.update(1)
 
         # Check for compilation and tests
 
         # If repo was not removed, then it is a good repo
-        row["good_repo_for_crab"] = True
+        updates["good_repo_for_crab"] = True
+        return updates
 
 def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False) -> None:
     """
@@ -162,26 +172,48 @@ def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False)
     if verbose: print(f"Reading CSV file {file}")
     df = pd.read_csv(file)
 
-    df["successfully_cloned"] = None
-    df["build_system"] = None
-    df["depth_of_build_file"] = None
-    df["detected_source_of_tests"] = None
-    df["error_msg"] = None
-    df["good_repo_for_crab"] = False
-    df["n_tests"] = None
-    df["n_tests_with_grep"] = None
-    df["n_tests_passed"] = None
-    df["n_tests_failed"] = None
-    df["n_tests_skipped"] = None
+    # drop all columns besides the name
+    df = df[["name"]]
+
+    updates_list = [] # Collect updates in a list
+    good_repos = 0
 
     try:
         if verbose: print("Processing repositories")
-        df.progress_apply(lambda row: process_row(row, dest, force=force, verbose=verbose), axis=1)
+        with tqdm(total=len(df)) as pbar:
+            for i, row in df.iterrows():
+                updates = process_row(row, dest, force=force, verbose=verbose)
+                if "good_repo_for_crab" in updates and updates["good_repo_for_crab"]:
+                    good_repos += 1
+                pbar.update(1)
+                pbar.set_postfix({"repo": row["name"], "good_repos": good_repos}, refresh=True)
+                updates_list.append((i, updates)) # Collect updates
     except KeyboardInterrupt:
         print("Keyboard interrupt detected. Stopping the processing of the repos...")
 
+    # Create columns for the new data
+    df = df.assign(
+        successfully_cloned=None,
+        build_system=None,
+        depth_of_build_file=None,
+        detected_source_of_tests=None,
+        error_msg=None,
+        good_repo_for_crab=False,
+        n_tests=None,
+        n_tests_with_grep=None,
+        n_tests_passed=None,
+        n_tests_failed=None,
+        n_tests_skipped=None
+    )
+
+    # Set the new data
+    for index, updates in updates_list:
+        for col, value in updates.items():
+            df.at[index, col] = value # Batch updates to avoid fragmentation
+
     if verbose: print("Writing results...")
-    df.to_csv("results.csv.gz", index=False)
+    df.to_csv("results.csv", index=False)
 
 if __name__ == "__main__":
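The write-back in `clone_repos` follows a collect-then-assign pattern: create every result column once with `df.assign`, then fill individual cells from the collected `(index, updates)` pairs with `df.at`. A minimal sketch of the same pattern on dummy data:

    import pandas as pd

    df = pd.DataFrame({"name": ["owner/repo-a", "owner/repo-b"]})
    updates_list = [(0, {"good_repo_for_crab": True}),
                    (1, {"error_msg": "Repo in exclusion list"})]

    df = df.assign(good_repo_for_crab=False, error_msg=None)  # columns created in one go
    for index, updates in updates_list:
        for col, value in updates.items():
            df.at[index, col] = value  # scalar, label-based assignment

    print(df)

The inline comment in the diff suggests the motivation: creating the columns up front and batching the cell writes avoids the DataFrame fragmentation that repeated single-column insertions can cause.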