implemented lazy evaluation (ignore repo if already processed)

2025-10-14 04:08:01 +02:00 · 2025-03-02 11:01:12 +01:00
parent fb220eac1e
commit b1d98df292
1 changed file with 22 additions and 9 deletions
--- a/clone_repos.py
+++ b/clone_repos.py
@@ -3,6 +3,7 @@ import argparse, os, sys, subprocess, docker
 from tqdm import tqdm
 import shutil
 from typing import Optional
 import numpy as np
 from handlers import GradleHandler, MavenHandler, BuildHandler
@@ -159,7 +160,7 @@ def process_row(repo, client, dest: str, updates: dict, force: bool = False, ver
            # If repo was not removed, then it is a good repo
            updates["good_repo_for_crab"] = True
-def save_df_with_updates(df, updates_list, verbose=False):
+def save_df_with_updates(df, updates_list, results_file: str, verbose=False):
    # Create columns for the new data
    df = df.assign(
        cloned_successfully=None,
@@ -184,9 +185,9 @@ def save_df_with_updates(df, updates_list, verbose=False):
            df.at[index, col] = value  # Batch updates to avoid fragmentation
    if verbose: print("Writing results...")
-    df.to_csv("results.csv", index=False)
+    df.to_csv(results_file, index=False)
-def process_repos(file: str, dest: str, force: bool =False, verbose: bool = False) -> None:
+def process_repos(file: str, dest: str, results_file: str, /, lazy: bool = False, force: bool =False, verbose: bool = False) -> None:
    """
    Download the repos listed in the file passed as argument. The downloaded repos will be placed in the folder that is named as the dest argument.
@@ -198,6 +199,7 @@ def process_repos(file: str, dest: str, force: bool =False, verbose: bool = Fals
    """
    if verbose: print(f"Reading CSV file {file}")
    df = pd.read_csv(file)
    results_df = pd.read_csv(results_file) if lazy else None
    # drop all columns besides the name
    df = df[["name"]]
@@ -206,45 +208,56 @@ def process_repos(file: str, dest: str, force: bool =False, verbose: bool = Fals
    client = docker.from_env()
    good_repos = 0
    n_processed = 0
    last_i_saved = -1
    try:
        if verbose: print("Processing repositories")
        with tqdm(total=len(df)) as pbar:
            for i, row in df.iterrows():
                if i % 10 == 0:
-                    save_df_with_updates(df, updates_list, verbose=verbose)
+                    save_df_with_updates(df, updates_list, results_file, verbose=verbose)
                    last_i_saved = i
                pbar.set_postfix({
                    "repo": row["name"],
                    "last index saved": last_i_saved,
-                    "# good repos": f"{good_repos} ({good_repos/len(updates_list) if len(updates_list) > 0 else 0:.2%})", 
+                    "# good repos": f"{good_repos} ({good_repos/n_processed if n_processed > 0 else 0:.2%})", 
                })
                if lazy:
                    already_good_for_crab = results_df[results_df["name"] == row["name"]].iloc[0]["good_repo_for_crab"]
                    if not np.isnan(already_good_for_crab):
                        pbar.update(1)
                        n_processed += 1
                        good_repos += 1 if already_good_for_crab else 0
                        continue
                updates = {}
                updates_list.append((i, updates))  # Collect updates
                process_row(row["name"], client, dest, updates, force=force, verbose=verbose)
                if "good_repo_for_crab" in updates and updates["good_repo_for_crab"]:
                    good_repos += 1
                pbar.update(1)
                n_processed += 1
    except KeyboardInterrupt as e:
        print("Interrupted by user, saving progress...")
-        save_df_with_updates(df, updates_list, verbose=verbose)
+        save_df_with_updates(df, updates_list, results_file, verbose=verbose)
        raise e
    except Exception as e:
        print("An error occured, saving progress and then raising the error...")
-        save_df_with_updates(df, updates_list, verbose=verbose)
+        save_df_with_updates(df, updates_list, results_file, verbose=verbose)
        raise e
    if verbose: print("Saving results...")
-    save_df_with_updates(df, updates_list, verbose=verbose)
+    save_df_with_updates(df, updates_list, results_file, verbose=verbose)
 if __name__ == "__main__":
    # write the code to parse the arguments here
    parser = argparse.ArgumentParser(description="Clone repos from a given file")
    parser.add_argument("file", default="results.csv.gz", help="The file to download the repos from. Default is 'results.csv.gz'")
    parser.add_argument("-d", "--dest", default="./results/", help="The root directory in which to download the repos. Default is './results/'")
    parser.add_argument("-r", "--results", default="results.csv", help="The name of file in which to save the results. Also used with --continue. Default is 'results.csv'")
    parser.add_argument("-l", "--lazy", action="store_true", help="If given, the program will continue from where it left off, by not touch the already processed repos. Will look at the file pointed by the --results argument")
    parser.add_argument("-f", "--force", action="store_true", help="Force the download of the repos")
    parser.add_argument("-v", "--verbose", action="store_true", help="Make the program verbose")
    args = parser.parse_args()
-    process_repos(args.file, args.dest, force=args.force, verbose=args.verbose)
+    process_repos(args.file, args.dest, args.results, lazy=args.lazy, force=args.force, verbose=args.verbose)