# crab/clone_repos.py
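"""
Clone the repositories listed in a SEART-GHS CSV export, detect their build
system (Maven or Gradle), check that they have a test suite, then compile and
run the tests inside a Docker container, recording the outcome of every step
in `results.csv`.

Example invocation (assuming the CSV has a `name` column with `owner/repo`
entries and that the `crab-java-env` image has already been built locally):

    python clone_repos.py results.csv.gz -d ./results/ -v
"""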
import pandas as pd
import argparse, os, sys, subprocess, docker, re
import shutil
from tqdm import tqdm

tqdm.pandas()

USER_ID = os.getuid()  # for container user
GROUP_ID = os.getgid()

EXCLUSION_LIST = [
    "edmcouncil/idmp",  # requires authentication
    "aosp-mirror/platform_frameworks_base",  # takes ages to clone
]

GRADLE_BASE_CMD = "gradle --no-daemon --console=plain"
MAVEN_BASE_CMD = "mvn -B -Dstyle.color=never -Dartifact.download.skip=true"
# -B (Batch Mode): Runs Maven in non-interactive mode, reducing output and removing download progress bars.
# -Dstyle.color=never: Disables ANSI colors.
# -Dartifact.download.skip=true: Prevents Maven from printing download logs (but still downloads dependencies when needed).

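# These base commands are executed inside the "crab-java-env" Docker container
# (see process_row below), which is therefore assumed to ship with both Maven
# and Gradle on its PATH; the repository under analysis is mounted at /repo.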
def clone(repo: str, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None:
    """
    Clones a GitHub repository into a local directory.

    Args:
        repo (str): The repository to clone, in the format "owner/repo_name".
        dest (str): The root directory in which to place the clone.
        updates (dict): Dictionary in which the outcome of the clone is recorded.
        force (bool, optional): If `True`, re-clones the repository even if it already exists. Defaults to `False`.
        verbose (bool, optional): If `True`, prints progress information. Defaults to `False`.
    """
    local_repo_path = os.path.join(dest, repo)
    if not force and os.path.exists(local_repo_path):
        # if verbose: print(f"Skipping {repo}, already exists")
        updates["cloned_successfully"] = "Already exists"
        return

    if verbose: print(f"Cloning {repo}")
    proc = subprocess.run(
        ["git", "clone", "--depth", "1", f"https://github.com/{repo}", local_repo_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if proc.returncode != 0:
        updates["cloned_successfully"] = False
        print(f"Failed to clone {repo}", file=sys.stderr)
        print("Error message was:", file=sys.stderr)
        error_msg = proc.stderr.decode()
        print(error_msg, file=sys.stderr)
        updates["error_msg"] = error_msg
    else:
        updates["cloned_successfully"] = True

def get_build_file(root: str, repo: str, updates: dict, verbose: bool = False):
    """
    Get the path to the build file of a repository. The build file is either a
    `pom.xml` or a `build.gradle` file, located at the root of the repository
    or in one of its immediate subdirectories.

    Args:
        root (str): The root directory in which the repository is located.
        repo (str): The name of the repository.
        updates (dict): Dictionary in which the search outcome is recorded.
        verbose (bool, optional): If `True`, prints progress information. Defaults to `False`.

    Returns:
        str | None: The path to the build file if one is found, `None` otherwise.
    """
    path = os.path.join(root, repo)
    # Check if the given path is a directory
    if not os.path.isdir(path):
        error_msg = f"The path {path} is not a valid directory."
        print(error_msg, file=sys.stderr)
        updates["error_msg"] = error_msg
        return None

    to_keep = ["pom.xml", "build.gradle"]
    for entry in os.scandir(path):
        if entry.is_file() and entry.name in to_keep:
            if verbose: print(f"Found {entry.name} in {repo} root, so keeping it and returning")
            updates["depth_of_build_file"] = 0
            if entry.name == "build.gradle":
                updates["build_system"] = "gradle"
            else:
                updates["build_system"] = "maven"
            return os.path.join(path, entry.name)

    # List files in the immediate subdirectories
    for entry in os.scandir(path):
        if entry.is_dir():
            for sub_entry in os.scandir(entry.path):
                if sub_entry.is_file() and sub_entry.name in to_keep:
                    if verbose: print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning")
                    updates["depth_of_build_file"] = 1
                    if sub_entry.name == "build.gradle":
                        updates["build_system"] = "gradle"
                    else:
                        updates["build_system"] = "maven"
                    return os.path.join(path, entry.name, sub_entry.name)

    updates["error_msg"] = "No build file found"
    return None

def has_tests(path: str, build_file: str, updates: dict) -> bool:
    """
    Heuristically checks whether a repository has a test suite, by looking at
    its build file and at the usual test directories.
    """
    with open(build_file, "r") as f:
        content = f.read()

    for library in ["junit", "testng", "mockito"]:
        if library in content:
            updates["detected_source_of_tests"] = library + " library in build file"
            return True

    for keyword in ["testImplementation", "functionalTests", "bwc_tests_enabled"]:
        if keyword in content:
            updates["detected_source_of_tests"] = keyword + " keyword in build file"
            return False

    test_dirs = [
        "src/test/java",
        "src/test/kotlin",
        "src/test/groovy",
        "test",
    ]
    for td in test_dirs:
        if os.path.exists(os.path.join(path, td)):
            updates["detected_source_of_tests"] = td + " dir exists in repo"
            return True

    updates["error_msg"] = "No tests found"
    return False

def remove_dir(dir: str) -> None:
    """
    Removes a directory and all its contents. Also removes the parent directory
    if it is empty after removing the child (dir).

    Args:
        dir (str): The directory to remove.
    """
    shutil.rmtree(dir)
    parent = os.path.abspath(os.path.join(dir, os.path.pardir))
    if os.listdir(parent) == []:
        shutil.rmtree(parent)

def merge_download_lines(lines: list) -> list:
    """
    Merges lines that are part of the same download block in Maven output.

    Args:
        lines (list): The lines to merge.

    Returns:
        list: The merged lines.
    """
    downloading_block = False
    cleaned_lines = []
    for line in lines:
        if re.match(r"\[INFO\] Download(ing|ed) from", line):
            if not downloading_block:
                cleaned_lines.append("[CRAB] Downloading stuff")
                downloading_block = True
        else:
            cleaned_lines.append(line)
            downloading_block = False
    return cleaned_lines

def merge_unapproved_licences(lines: list) -> list:
    """
    Merges lines that are part of the same unapproved licences block in Maven output.

    Args:
        lines (list): The lines to merge.

    Returns:
        list: The merged lines.
    """
    licenses_block = False
    cleaned_lines = []
    for line in lines:
        if re.match(r"\[WARNING\] Files with unapproved licenses:", line):
            cleaned_lines.append(line)
            cleaned_lines.append("[CRAB] List of all the unapproved licenses...")
            licenses_block = True
        elif licenses_block and not re.match(r"\s+\?\/\.m2\/repository", line):
            licenses_block = False

        if not licenses_block:
            cleaned_lines.append(line)
    return cleaned_lines

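# clean_output ties the two merge passes above together. Illustrative effect
# (not output from a real run): runs of "[INFO] Downloading from ..." /
# "[INFO] Downloaded from ..." lines collapse into a single
# "[CRAB] Downloading stuff" placeholder, and the file list printed after
# "[WARNING] Files with unapproved licenses:" is replaced by
# "[CRAB] List of all the unapproved licenses...".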
def clean_output(output: bytes) -> str:
    output_lines = output.decode().split("\n")
    cleaned_lines = merge_download_lines(output_lines)
    cleaned_lines = merge_unapproved_licences(cleaned_lines)
    return "\n".join(cleaned_lines)

def compile_repo(build_file: str, container, updates: dict) -> bool:
    """
    Attempts to compile a repository inside a running Docker container.
    """
    if build_file.endswith("pom.xml"):
        build_cmd = f"{MAVEN_BASE_CMD} clean compile"
    elif build_file.endswith("build.gradle"):
        build_cmd = f"{GRADLE_BASE_CMD} compileJava"
    else:
        updates["error_msg"] = "Unsupported build system for compiling: " + build_file
        return False

    exec_result = container.exec_run(build_cmd)
    output = clean_output(exec_result.output)
    if exec_result.exit_code != 0:
        updates["compiled_successfully"] = False
        updates["error_msg"] = output
        return False

    updates["compiled_successfully"] = True
    return True

def test_repo(build_file: str, container, updates: dict) -> bool:
    """
    Attempts to run the test suite of a repository inside a running Docker container.
    """
    if build_file.endswith("pom.xml"):
        test_cmd = f"{MAVEN_BASE_CMD} test"
    elif build_file.endswith("build.gradle"):
        test_cmd = f"{GRADLE_BASE_CMD} test"
    else:
        updates["error_msg"] = "Unsupported build system for testing: " + build_file
        return False

    exec_result = container.exec_run(test_cmd)
    output = clean_output(exec_result.output)
    if exec_result.exit_code != 0:
        updates["tested_successfully"] = False
        updates["error_msg"] = output
        return False

    updates["tested_successfully"] = True
    updates["error_msg"] = output
    return True

def clean_repo(build_file: str, container):
    if build_file.endswith("pom.xml"):
        clean_cmd = f"{MAVEN_BASE_CMD} clean"
    elif build_file.endswith("build.gradle"):
        clean_cmd = f"{GRADLE_BASE_CMD} clean"
    else:
        return
    container.exec_run(clean_cmd)

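# Per-repository pipeline: clone (only attempted when force is set), locate the
# build file, check for a test suite, then compile and run the tests inside a
# "crab-java-env" container with the repo mounted read-write at /repo. The
# local clone is removed as soon as any step fails.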
def process_row(repo, client, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None:
    updates["good_repo_for_crab"] = False
    with tqdm(total=5, leave=False) as pbar:
        if repo in EXCLUSION_LIST:
            updates["error_msg"] = "Repo in exclusion list"
            if verbose: print(f"Skipping {repo}, in exclusion list")
            return

        pbar.set_postfix_str("Cloning...")
        if force:
            clone(repo, dest, updates, verbose=verbose)
        pbar.update(1)

        repo_path = os.path.join(dest, repo)
        if not os.path.exists(repo_path):
            updates["error_msg"] = "Repo not cloned"
            return

        pbar.set_postfix_str("Getting build file...")
        build_file = get_build_file(dest, repo, updates)
        if build_file is None:
            if verbose: print(f"Removing {repo}, no build file")
            remove_dir(repo_path)
            return
        pbar.update(1)

        pbar.set_postfix_str("Checking for tests...")
        if not has_tests(repo_path, build_file, updates):
            if verbose: print(f"Removing {repo}, no test suites")
            remove_dir(repo_path)
            return
        if verbose: print(f"Keeping {repo}")
        pbar.update(1)

        container = client.containers.run(
            image="crab-java-env",
            command="tail -f /dev/null",
            volumes={os.path.abspath(repo_path): {"bind": "/repo", "mode": "rw"}},
            user=f"{USER_ID}:{GROUP_ID}",
            detach=True,
            tty=True
        )
        try:
            pbar.set_postfix_str("Compiling...")
            compiled = compile_repo(build_file, container, updates)
            if not compiled:
                if verbose: print(f"Removing {repo}, failed to compile")
                clean_repo(build_file, container)
                remove_dir(repo_path)
                return
            pbar.update(1)

            pbar.set_postfix_str("Running tests...")
            tested = test_repo(build_file, container, updates)
            clean_repo(build_file, container)
            if not tested:
                if verbose: print(f"Removing {repo}, failed to run tests")
                remove_dir(repo_path)
                return
            pbar.update(1)

            # If repo was not removed, then it is a good repo
            updates["good_repo_for_crab"] = True
        finally:
            container.kill()
            container.remove()

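# Note: the n_tests* columns are created below so that they appear in
# results.csv, but nothing in this script fills them in.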
def save_df_with_updates(df, updates_list, verbose=False):
    # Create columns for the new data
    df = df.assign(
        cloned_successfully=None,
        build_system=None,
        depth_of_build_file=None,
        detected_source_of_tests=None,
        compiled_successfully=None,
        tested_successfully=None,
        n_tests=None,
        n_tests_with_grep=None,
        n_tests_passed=None,
        n_tests_failed=None,
        n_tests_skipped=None,
        good_repo_for_crab=None,
        error_msg=None,
    )

    # Set the new data
    for index, updates in updates_list:
        for col, value in updates.items():
            df.at[index, col] = value  # Batch updates to avoid fragmentation

    if verbose: print("Writing results...")
    df.to_csv("results.csv", index=False)

def clone_repos(file: str, dest: str, force: bool = False, verbose: bool = False) -> None:
    """
    Downloads and processes the repos listed in the file passed as argument.
    The downloaded repos are placed under the directory given as `dest`.

    Arguments:
        file (str): The name of the file to read the repos from. Must be a .csv.gz file (downloaded from https://seart-ghs.si.usi.ch)
        dest (str): The name of the root directory in which to download the repos
        force (bool): If `True`, forces the download of the repos. Defaults to `False`.
        verbose (bool): If `True`, outputs detailed process information. Defaults to `False`.
    """
    if verbose: print(f"Reading CSV file {file}")
    df = pd.read_csv(file)

    # drop all columns besides the name
    df = df[["name"]]

    updates_list = []  # Collect updates in a list

    client = docker.from_env()
    good_repos = 0
    try:
        if verbose: print("Processing repositories")
        with tqdm(total=len(df)) as pbar:
            for i, row in df.iterrows():
                pbar.set_postfix({"repo": row["name"], "good_repos": good_repos})
                updates = {}
                updates_list.append((i, updates))  # Collect updates
                process_row(row["name"], client, dest, updates, force=force, verbose=verbose)
                if "good_repo_for_crab" in updates and updates["good_repo_for_crab"]:
                    good_repos += 1
                pbar.update(1)
    except KeyboardInterrupt as e:
        print("Interrupted by user, saving progress...")
        save_df_with_updates(df, updates_list, verbose=verbose)
        raise e
    except Exception as e:
        print("An error occurred, saving progress and then raising the error...")
        save_df_with_updates(df, updates_list, verbose=verbose)
        raise e

    if verbose: print("Saving results...")
    save_df_with_updates(df, updates_list, verbose=verbose)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Clone repos from a given file")

    # nargs="?" makes the positional argument optional so that the documented default is actually used
    parser.add_argument("file", nargs="?", default="results.csv.gz", help="The file to download the repos from. Default is 'results.csv.gz'")
    parser.add_argument("-d", "--dest", default="./results/", help="The root directory in which to download the repos. Default is './results/'")
    parser.add_argument("-f", "--force", action="store_true", help="Force the download of the repos")
    parser.add_argument("-v", "--verbose", action="store_true", help="Make the program verbose")

    args = parser.parse_args()

    clone_repos(args.file, args.dest, force=args.force, verbose=args.verbose)