mirror of
https://github.com/karma-riuk/crab.git
synced 2025-07-05 21:38:13 +02:00
implemented lazy evaluation (ignore repo if already processed)
This commit is contained in:
@ -3,6 +3,7 @@ import argparse, os, sys, subprocess, docker
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
import shutil
|
import shutil
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
from handlers import GradleHandler, MavenHandler, BuildHandler
|
from handlers import GradleHandler, MavenHandler, BuildHandler
|
||||||
|
|
||||||
@ -159,7 +160,7 @@ def process_row(repo, client, dest: str, updates: dict, force: bool = False, ver
|
|||||||
# If repo was not removed, then it is a good repo
|
# If repo was not removed, then it is a good repo
|
||||||
updates["good_repo_for_crab"] = True
|
updates["good_repo_for_crab"] = True
|
||||||
|
|
||||||
def save_df_with_updates(df, updates_list, verbose=False):
|
def save_df_with_updates(df, updates_list, results_file: str, verbose=False):
|
||||||
# Create columns for the new data
|
# Create columns for the new data
|
||||||
df = df.assign(
|
df = df.assign(
|
||||||
cloned_successfully=None,
|
cloned_successfully=None,
|
||||||
@ -184,9 +185,9 @@ def save_df_with_updates(df, updates_list, verbose=False):
|
|||||||
df.at[index, col] = value # Batch updates to avoid fragmentation
|
df.at[index, col] = value # Batch updates to avoid fragmentation
|
||||||
|
|
||||||
if verbose: print("Writing results...")
|
if verbose: print("Writing results...")
|
||||||
df.to_csv("results.csv", index=False)
|
df.to_csv(results_file, index=False)
|
||||||
|
|
||||||
def process_repos(file: str, dest: str, force: bool =False, verbose: bool = False) -> None:
|
def process_repos(file: str, dest: str, results_file: str, /, lazy: bool = False, force: bool =False, verbose: bool = False) -> None:
|
||||||
"""
|
"""
|
||||||
Download the repos listed in the file passed as argument. The downloaded repos will be placed in the folder that is named as the dest argument.
|
Download the repos listed in the file passed as argument. The downloaded repos will be placed in the folder that is named as the dest argument.
|
||||||
|
|
||||||
@ -198,6 +199,7 @@ def process_repos(file: str, dest: str, force: bool =False, verbose: bool = Fals
|
|||||||
"""
|
"""
|
||||||
if verbose: print(f"Reading CSV file {file}")
|
if verbose: print(f"Reading CSV file {file}")
|
||||||
df = pd.read_csv(file)
|
df = pd.read_csv(file)
|
||||||
|
results_df = pd.read_csv(results_file) if lazy else None
|
||||||
|
|
||||||
# drop all columns besides the name
|
# drop all columns besides the name
|
||||||
df = df[["name"]]
|
df = df[["name"]]
|
||||||
@ -206,45 +208,56 @@ def process_repos(file: str, dest: str, force: bool =False, verbose: bool = Fals
|
|||||||
client = docker.from_env()
|
client = docker.from_env()
|
||||||
|
|
||||||
good_repos = 0
|
good_repos = 0
|
||||||
|
n_processed = 0
|
||||||
last_i_saved = -1
|
last_i_saved = -1
|
||||||
try:
|
try:
|
||||||
if verbose: print("Processing repositories")
|
if verbose: print("Processing repositories")
|
||||||
with tqdm(total=len(df)) as pbar:
|
with tqdm(total=len(df)) as pbar:
|
||||||
for i, row in df.iterrows():
|
for i, row in df.iterrows():
|
||||||
if i % 10 == 0:
|
if i % 10 == 0:
|
||||||
save_df_with_updates(df, updates_list, verbose=verbose)
|
save_df_with_updates(df, updates_list, results_file, verbose=verbose)
|
||||||
last_i_saved = i
|
last_i_saved = i
|
||||||
pbar.set_postfix({
|
pbar.set_postfix({
|
||||||
"repo": row["name"],
|
"repo": row["name"],
|
||||||
"last index saved": last_i_saved,
|
"last index saved": last_i_saved,
|
||||||
"# good repos": f"{good_repos} ({good_repos/len(updates_list) if len(updates_list) > 0 else 0:.2%})",
|
"# good repos": f"{good_repos} ({good_repos/n_processed if n_processed > 0 else 0:.2%})",
|
||||||
})
|
})
|
||||||
|
if lazy:
|
||||||
|
already_good_for_crab = results_df[results_df["name"] == row["name"]].iloc[0]["good_repo_for_crab"]
|
||||||
|
if not np.isnan(already_good_for_crab):
|
||||||
|
pbar.update(1)
|
||||||
|
n_processed += 1
|
||||||
|
good_repos += 1 if already_good_for_crab else 0
|
||||||
|
continue
|
||||||
updates = {}
|
updates = {}
|
||||||
updates_list.append((i, updates)) # Collect updates
|
updates_list.append((i, updates)) # Collect updates
|
||||||
process_row(row["name"], client, dest, updates, force=force, verbose=verbose)
|
process_row(row["name"], client, dest, updates, force=force, verbose=verbose)
|
||||||
if "good_repo_for_crab" in updates and updates["good_repo_for_crab"]:
|
if "good_repo_for_crab" in updates and updates["good_repo_for_crab"]:
|
||||||
good_repos += 1
|
good_repos += 1
|
||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
|
n_processed += 1
|
||||||
except KeyboardInterrupt as e:
|
except KeyboardInterrupt as e:
|
||||||
print("Interrupted by user, saving progress...")
|
print("Interrupted by user, saving progress...")
|
||||||
save_df_with_updates(df, updates_list, verbose=verbose)
|
save_df_with_updates(df, updates_list, results_file, verbose=verbose)
|
||||||
raise e
|
raise e
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("An error occured, saving progress and then raising the error...")
|
print("An error occured, saving progress and then raising the error...")
|
||||||
save_df_with_updates(df, updates_list, verbose=verbose)
|
save_df_with_updates(df, updates_list, results_file, verbose=verbose)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
if verbose: print("Saving results...")
|
if verbose: print("Saving results...")
|
||||||
save_df_with_updates(df, updates_list, verbose=verbose)
|
save_df_with_updates(df, updates_list, results_file, verbose=verbose)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# whtie the code to parse the arguments here
|
# whtie the code to parse the arguments here
|
||||||
parser = argparse.ArgumentParser(description="Clone repos from a given file")
|
parser = argparse.ArgumentParser(description="Clone repos from a given file")
|
||||||
parser.add_argument("file", default="results.csv.gz", help="The file to download the repos from. Default is 'results.csv.gz'")
|
parser.add_argument("file", default="results.csv.gz", help="The file to download the repos from. Default is 'results.csv.gz'")
|
||||||
parser.add_argument("-d", "--dest", default="./results/", help="The root directory in which to download the repos. Default is './results/'")
|
parser.add_argument("-d", "--dest", default="./results/", help="The root directory in which to download the repos. Default is './results/'")
|
||||||
|
parser.add_argument("-r", "--results", default="results.csv", help="The name of file in which to save the results. Also used with --continue. Default is 'results.csv'")
|
||||||
|
parser.add_argument("-l", "--lazy", action="store_true", help="If given, the program will continue from where it left off, by not touch the already processed repos. Will look at the file pointed by the --results argument")
|
||||||
parser.add_argument("-f", "--force", action="store_true", help="Force the download of the repos")
|
parser.add_argument("-f", "--force", action="store_true", help="Force the download of the repos")
|
||||||
parser.add_argument("-v", "--verbose", action="store_true", help="Make the program verbose")
|
parser.add_argument("-v", "--verbose", action="store_true", help="Make the program verbose")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
process_repos(args.file, args.dest, force=args.force, verbose=args.verbose)
|
process_repos(args.file, args.dest, args.results, lazy=args.lazy, force=args.force, verbose=args.verbose)
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user