Merge branch 'handler-class'

This commit is contained in:
Karma Riuk
2025-03-01 17:25:26 +01:00
2 changed files with 219 additions and 178 deletions

View File

@ -1,24 +1,18 @@
import pandas as pd import pandas as pd
import argparse, os, sys, subprocess, docker, re import argparse, os, sys, subprocess, docker
from tqdm import tqdm from tqdm import tqdm
import shutil import shutil
from typing import Optional
from handlers import GradleHandler, MavenHandler, BuildHandler
tqdm.pandas() tqdm.pandas()
USER_ID = os.getuid() # for container user
GROUP_ID = os.getgid()
EXCLUSION_LIST = [ EXCLUSION_LIST = [
"edmcouncil/idmp", # requires authentication "edmcouncil/idmp", # requires authentication
"aosp-mirror/platform_frameworks_base", # takes ages to clone "aosp-mirror/platform_frameworks_base", # takes ages to clone
] ]
GRADLE_BASE_CMD = "gradle --no-daemon --console=plain"
MAVEN_BASE_CMD = "mvn -B -Dstyle.color=never -Dartifact.download.skip=true"
# -B (Batch Mode): Runs Maven in non-interactive mode, reducing output and removing download progress bars.
# -Dstyle.color=never: Disables ANSI colors.
# -Dartifact.download.skip=true: Prevents Maven from printing download logs (but still downloads dependencies when needed).
def clone(repo: str, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None: def clone(repo: str, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None:
""" """
Clones a GitHub repository into a local directory. Clones a GitHub repository into a local directory.
@ -49,7 +43,7 @@ def clone(repo: str, dest: str, updates: dict, force: bool = False, verbose: boo
else: else:
updates["cloned_successfully"] = True updates["cloned_successfully"] = True
def get_build_file(root: str, repo: str, updates: dict, verbose: bool = False): def get_build_handler(root: str, repo: str, updates: dict, verbose: bool = False) -> Optional[BuildHandler]:
""" """
Get the path to the build file of a repository. The build file is either a Get the path to the build file of a repository. The build file is either a
`pom.xml`, `build.gradle`, or `build.xml` file. `pom.xml`, `build.gradle`, or `build.xml` file.
@ -76,9 +70,10 @@ def get_build_file(root: str, repo: str, updates: dict, verbose: bool = False):
updates["depth_of_build_file"] = 0 updates["depth_of_build_file"] = 0
if entry.name == "build.gradle": if entry.name == "build.gradle":
updates["build_system"] = "gradle" updates["build_system"] = "gradle"
return GradleHandler(path, entry.name, updates)
else: else:
updates["build_system"] = "maven" updates["build_system"] = "maven"
return os.path.join(path, entry.name) return MavenHandler(path, entry.name, updates)
# List files in the immediate subdirectories # List files in the immediate subdirectories
for entry in os.scandir(path): for entry in os.scandir(path):
@ -89,41 +84,14 @@ def get_build_file(root: str, repo: str, updates: dict, verbose: bool = False):
updates["depth_of_build_file"] = 1 updates["depth_of_build_file"] = 1
if entry.name == "build.gradle": if entry.name == "build.gradle":
updates["build_system"] = "gradle" updates["build_system"] = "gradle"
return GradleHandler(path, os.path.join(entry.name, sub_entry.name), updates)
else: else:
updates["build_system"] = "maven" updates["build_system"] = "maven"
return os.path.join(path, entry.name, sub_entry.name) return MavenHandler(path, os.path.join(entry.name, sub_entry.name), updates)
updates["error_msg"] = "No build file found" updates["error_msg"] = "No build file found"
return None return None
def has_tests(path: str, build_file: str, updates: dict) -> bool:
    """
    Heuristically decides whether a repository contains a test suite.

    The build file is searched for the names of common test libraries and for
    test-related keywords. If neither is present, a few conventional test
    directories are looked up under the repository root.

    Args:
        path (str): The root directory of the repository.
        build_file (str): The path of the build file to inspect.
        updates (dict): Receives "detected_source_of_tests" when an indicator
            is found, or "error_msg" when no indicator is found.

    Returns:
        bool: True if any indicator of tests was found, False otherwise.
    """
    # Read once and release the file handle before the remaining checks
    with open(build_file, "r") as f:
        content = f.read()

    for library in ("junit", "testng", "mockito"):
        if library in content:
            updates["detected_source_of_tests"] = library + " library in build file"
            return True

    for keyword in ("testImplementation", "functionalTests", "bwc_tests_enabled"):
        if keyword in content:
            updates["detected_source_of_tests"] = keyword + " keyword in build file"
            # A test keyword is positive evidence, so it must count as "has tests"
            return True

    test_dirs = (
        "src/test/java",
        "src/test/kotlin",
        "src/test/groovy",
        "test",
    )
    for td in test_dirs:
        if os.path.exists(os.path.join(path, td)):
            updates["detected_source_of_tests"] = td + " dir exists in repo"
            return True

    updates["error_msg"] = "No tests found"
    return False
def remove_dir(dir: str) -> None: def remove_dir(dir: str) -> None:
""" """
Removes a directory and all its contents. Removes parent directory if it is empty after removing child (dir). Removes a directory and all its contents. Removes parent directory if it is empty after removing child (dir).
@ -136,113 +104,6 @@ def remove_dir(dir: str) -> None:
if os.listdir(parent) == []: if os.listdir(parent) == []:
shutil.rmtree(parent) shutil.rmtree(parent)
def merge_download_lines(lines: list) -> list:
    """
    Collapses every run of consecutive Maven download lines into one placeholder.

    A download line is one starting with `[INFO] Downloading from` or
    `[INFO] Downloaded from`. All other lines are kept unchanged and in order.

    Args:
        lines (list): The output lines to process.

    Returns:
        list: The lines with each download run replaced by a single
            `[CRAB] Downloading stuff` entry.
    """
    download_marker = re.compile(r"\[INFO\] Download(ing|ed) from")
    merged = []
    previous_was_download = False

    for current in lines:
        is_download = download_marker.match(current) is not None
        if not is_download:
            merged.append(current)
        elif not previous_was_download:
            # Only the first line of a run emits the placeholder
            merged.append("[CRAB] Downloading stuff")
        previous_was_download = is_download

    return merged
def merge_unapproved_licences(lines: list) -> list:
    """
    Replaces the file listing of an unapproved-licences block by one placeholder.

    The `[WARNING] Files with unapproved licenses:` header is kept and followed
    by a `[CRAB]` placeholder. The entries listed after it (lines made of
    leading whitespace and `?/.m2/repository`) are dropped until the first line
    that is not such an entry.

    Args:
        lines (list): The output lines to process.

    Returns:
        list: The lines with each licence listing collapsed.
    """
    header_pattern = r"\[WARNING\] Files with unapproved licenses:"
    entry_pattern = r"\s+\?\/\.m2\/repository"

    result = []
    skipping_entries = False
    for text in lines:
        if re.match(header_pattern, text):
            result.append(text)
            result.append("[CRAB] List of all the unapproved licenses...")
            skipping_entries = True
            continue

        if skipping_entries and re.match(entry_pattern, text):
            # Still inside the listing: drop the entry
            continue

        skipping_entries = False
        result.append(text)

    return result
def clean_output(output: bytes) -> str:
    """
    Decodes raw build output and strips its noisiest blocks.

    Args:
        output (bytes): The raw output of a build command.

    Returns:
        str: The decoded output, with download runs and unapproved-licence
            listings collapsed into `[CRAB]` placeholder lines.
    """
    # Build tools may emit bytes that are not valid UTF-8; replacing them keeps
    # one odd character from aborting the whole run with a UnicodeDecodeError.
    output_lines = output.decode(errors="replace").split("\n")

    cleaned_lines = merge_download_lines(output_lines)
    cleaned_lines = merge_unapproved_licences(cleaned_lines)

    return "\n".join(cleaned_lines)
def compile_repo(build_file: str, container, updates: dict) -> bool:
    """
    Attempts to compile a repository inside a running Docker container.

    Args:
        build_file (str): The path of the build file; its name selects Maven
            (`pom.xml`) or Gradle (`build.gradle`).
        container: The running container in which the command is executed.
        updates (dict): Receives "compiled_successfully", and "error_msg" when
            the build system is unsupported or the compilation fails.

    Returns:
        bool: True if the compile command exited with code 0, False otherwise.
    """
    if build_file.endswith("pom.xml"):
        command = f"{MAVEN_BASE_CMD} clean compile"
    elif build_file.endswith("build.gradle"):
        command = f"{GRADLE_BASE_CMD} compileJava"
    else:
        updates["error_msg"] = f"Unsupported build system for compiling: {build_file}"
        return False

    result = container.exec_run(command)
    log = clean_output(result.output)

    succeeded = not (result.exit_code != 0)
    updates["compiled_successfully"] = succeeded
    if not succeeded:
        # The cleaned build log is kept only when compilation fails
        updates["error_msg"] = log
    return succeeded
def test_repo(build_file: str, container, updates: dict) -> bool:
if build_file.endswith("pom.xml"):
test_cmd = f"{MAVEN_BASE_CMD} test"
elif build_file.endswith("build.gradle"):
test_cmd = f"{GRADLE_BASE_CMD} test"
else:
updates["error_msg"] = "Unsupported build system for testing: " + build_file
return False
exec_result = container.exec_run(test_cmd)
output = clean_output(exec_result.output)
if exec_result.exit_code != 0:
updates["tested_successfully"] = False
updates["error_msg"] = output
return False
updates["tested_successfully"] = True
updates["error_msg"] = output
return True
def clean_repo(build_file: str, container):
    """
    Runs the build tool's `clean` task inside a running Docker container.

    Does nothing when the build file is neither a `pom.xml` nor a `build.gradle`.

    Args:
        build_file (str): The path of the build file.
        container: The running container in which the command is executed.
    """
    if build_file.endswith("pom.xml"):
        base_cmd = MAVEN_BASE_CMD
    elif build_file.endswith("build.gradle"):
        base_cmd = GRADLE_BASE_CMD
    else:
        # Unsupported build system: nothing to clean
        return

    container.exec_run(f"{base_cmd} clean")
def process_row(repo, client, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None: def process_row(repo, client, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None:
updates["good_repo_for_crab"] = False updates["good_repo_for_crab"] = False
with tqdm(total=5, leave=False) as pbar: with tqdm(total=5, leave=False) as pbar:
@ -261,55 +122,41 @@ def process_row(repo, client, dest: str, updates: dict, force: bool = False, ver
updates["error_msg"] = "Repo not cloned" updates["error_msg"] = "Repo not cloned"
return return
pbar.set_postfix_str("Getting build file...") pbar.set_postfix_str("Getting build handler...")
build_file = get_build_file(dest, repo, updates) build_handler = get_build_handler(dest, repo, updates)
if build_file is None: if build_handler is None:
if verbose: print(f"Removing {repo}, no build file") if verbose: print(f"Removing {repo}, no build file")
remove_dir(repo_path) remove_dir(repo_path)
return return
pbar.update(1) pbar.update(1)
build_handler.set_client(client)
with build_handler:
pbar.set_postfix_str("Checking for tests...") pbar.set_postfix_str("Checking for tests...")
if not has_tests(repo_path, build_file, updates): if not build_handler.has_tests():
if verbose: print(f"Removing {repo}, no test suites") if verbose: print(f"Removing {repo}, no test suites")
remove_dir(repo_path) remove_dir(repo_path)
return return
if verbose: print(f"Keeping {repo}") if verbose: print(f"Keeping {repo}")
pbar.update(1) pbar.update(1)
container = client.containers.run(
image="crab-java-env",
command="tail -f /dev/null",
volumes={os.path.abspath(repo_path): {"bind": "/repo", "mode": "rw"}},
user=f"{USER_ID}:{GROUP_ID}",
detach=True,
tty=True
)
try:
pbar.set_postfix_str("Compiling...") pbar.set_postfix_str("Compiling...")
compiled = compile_repo(build_file, container, updates) if not build_handler.compile_repo():
if not compiled:
if verbose: print(f"Removing {repo}, failed to compile") if verbose: print(f"Removing {repo}, failed to compile")
clean_repo(build_file, container)
remove_dir(repo_path) remove_dir(repo_path)
return return
pbar.update(1) pbar.update(1)
pbar.set_postfix_str("Running tests...") pbar.set_postfix_str("Running tests...")
tested = test_repo(build_file, container, updates) if not build_handler.test_repo():
clean_repo(build_file, container)
if not tested:
if verbose: print(f"Removing {repo}, failed to run tests") if verbose: print(f"Removing {repo}, failed to run tests")
remove_dir(repo_path) remove_dir(repo_path)
return return
build_handler.clean_repo()
pbar.update(1) pbar.update(1)
# If repo was not removed, then it is a good repo # If repo was not removed, then it is a good repo
updates["good_repo_for_crab"] = True updates["good_repo_for_crab"] = True
finally:
container.kill()
container.remove()
def save_df_with_updates(df, updates_list, verbose=False): def save_df_with_updates(df, updates_list, verbose=False):
# Create columns for the new data # Create columns for the new data
@ -361,7 +208,7 @@ def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False)
if verbose: print("Processing repositories") if verbose: print("Processing repositories")
with tqdm(total=len(df)) as pbar: with tqdm(total=len(df)) as pbar:
for i, row in df.iterrows(): for i, row in df.iterrows():
pbar.set_postfix({"repo": row["name"], "good_repos": good_repos}) pbar.set_postfix({"repo": row["name"], "good_repos": good_repos, "n_gradle": sum(updates["build_system"] == "gradle" for _, updates in updates_list if "build_system" in updates)})
updates = {} updates = {}
updates_list.append((i, updates)) # Collect updates updates_list.append((i, updates)) # Collect updates
process_row(row["name"], client, dest, updates, force=force, verbose=verbose) process_row(row["name"], client, dest, updates, force=force, verbose=verbose)

194
handlers.py Normal file
View File

@ -0,0 +1,194 @@
from abc import ABC, abstractmethod
import os, re, docker
# Numeric user and group ids of the current process. They are passed as the
# `user` of the build container (presumably so that files written to the
# bind-mounted repository stay owned by the host user; TODO confirm).
USER_ID = os.getuid() # for container user
GROUP_ID = os.getgid()
class BuildHandler(ABC):
def __init__(self, repo_path: str, build_file: str, updates: dict) -> None:
super().__init__()
self.path: str = repo_path
# self.container: Optional[Container] = None
self.build_file: str = build_file
self.updates = updates
def set_client(self, client: docker.DockerClient):
self.client = client
def __enter__(self):
self.container = self.client.containers.run(
image=self.container_name(),
command="tail -f /dev/null", # to keep the container alive
volumes={os.path.abspath(self.path): {"bind": "/repo", "mode": "rw"}},
user=f"{USER_ID}:{GROUP_ID}",
detach=True,
tty=True
)
def __exit__(self, *args):
self.container.kill()
self.container.remove()
def has_tests(self) -> bool:
with open(os.path.join(self.path, self.build_file), "r") as f:
content = f.read()
for library in ["junit", "testng", "mockito"]:
if library in content:
self.updates["detected_source_of_tests"] = library + " library in build file"
return True
for keyword in ["testImplementation", "functionalTests", "bwc_tests_enabled"]:
if keyword in content:
self.updates["detected_source_of_tests"] = keyword + " keyword in build file"
return False
test_dirs = [
"src/test/java",
"src/test/kotlin",
"src/test/groovy",
"test",
]
for td in test_dirs:
if os.path.exists(os.path.join(self.path, td)):
self.updates["detected_source_of_tests"] = td + " dir exists in repo"
return True
self.updates["error_msg"] = "No tests found"
return False
def compile_repo(self) -> bool:
exec_result = self.container.exec_run(self.compile_cmd())
output = clean_output(exec_result.output)
if exec_result.exit_code != 0:
self.updates["compiled_successfully"] = False
self.updates["error_msg"] = output
return False
self.updates["compiled_successfully"] = True
return True
def test_repo(self) -> bool:
exec_result = self.container.exec_run(self.test_cmd())
output = clean_output(exec_result.output)
if exec_result.exit_code != 0:
self.updates["tested_successfully"] = False
self.updates["error_msg"] = output
return False
self.updates["tested_successfully"] = True
self.updates["error_msg"] = output
return True
def clean_repo(self) -> None:
self.container.exec_run(self.clean_cmd())
@abstractmethod
def compile_cmd(self) -> str:
pass
@abstractmethod
def test_cmd(self) -> str:
pass
@abstractmethod
def clean_cmd(self) -> str:
pass
@abstractmethod
def container_name(self) -> str:
pass
class MavenHandler(BuildHandler):
    """Build handler that drives the `mvn` command line."""

    def __init__(self, repo_path: str, build_file: str, updates: dict) -> None:
        super().__init__(repo_path, build_file, updates)
        # Flags shared by every Maven invocation:
        # -B (Batch Mode): Runs Maven in non-interactive mode, reducing output and removing download progress bars.
        # -Dstyle.color=never: Disables ANSI colors.
        # -Dartifact.download.skip=true: Prevents Maven from printing download logs (but still downloads dependencies when needed).
        self.base_cmd = "mvn -B -Dstyle.color=never -Dartifact.download.skip=true"

    def compile_cmd(self) -> str:
        """Returns the Maven command that cleans and compiles the project."""
        return self.base_cmd + " clean compile"

    def test_cmd(self) -> str:
        """Returns the Maven command that runs the tests."""
        return self.base_cmd + " test"

    def clean_cmd(self) -> str:
        """Returns the Maven command that removes the build artefacts."""
        return self.base_cmd + " clean"

    def container_name(self) -> str:
        """Returns the name of the Docker image used for Maven builds."""
        return "crab-maven"
class GradleHandler(BuildHandler):
    """Build handler that drives the `gradle` command line."""

    def __init__(self, repo_path: str, build_file: str, updates: dict) -> None:
        super().__init__(repo_path, build_file, updates)
        # Flags shared by every Gradle invocation
        self.base_cmd = "gradle --no-daemon --console=plain"

    def compile_cmd(self) -> str:
        """Returns the Gradle command that compiles the Java sources."""
        return self.base_cmd + " compileJava"

    def test_cmd(self) -> str:
        """Returns the Gradle command that runs the tests."""
        return self.base_cmd + " test"

    def clean_cmd(self) -> str:
        """Returns the Gradle command that removes the build artefacts."""
        return self.base_cmd + " clean"

    def container_name(self) -> str:
        """Returns the name of the Docker image used for Gradle builds."""
        return "crab-gradle"
def merge_download_lines(lines: list) -> list:
    """
    Collapses every run of consecutive Maven download lines into one placeholder.

    A download line is one starting with `[INFO] Downloading from` or
    `[INFO] Downloaded from`. All other lines are kept unchanged and in order.

    Args:
        lines (list): The output lines to process.

    Returns:
        list: The lines with each download run replaced by a single
            `[CRAB] Downloading stuff` entry.
    """
    download_marker = re.compile(r"\[INFO\] Download(ing|ed) from")
    merged = []
    previous_was_download = False

    for current in lines:
        is_download = download_marker.match(current) is not None
        if not is_download:
            merged.append(current)
        elif not previous_was_download:
            # Only the first line of a run emits the placeholder
            merged.append("[CRAB] Downloading stuff")
        previous_was_download = is_download

    return merged
def merge_unapproved_licences(lines: list) -> list:
    """
    Replaces the file listing of an unapproved-licences block by one placeholder.

    The `[WARNING] Files with unapproved licenses:` header is kept and followed
    by a `[CRAB]` placeholder. The entries listed after it (lines made of
    leading whitespace and `?/.m2/repository`) are dropped until the first line
    that is not such an entry.

    Args:
        lines (list): The output lines to process.

    Returns:
        list: The lines with each licence listing collapsed.
    """
    header_pattern = r"\[WARNING\] Files with unapproved licenses:"
    entry_pattern = r"\s+\?\/\.m2\/repository"

    result = []
    skipping_entries = False
    for text in lines:
        if re.match(header_pattern, text):
            result.append(text)
            result.append("[CRAB] List of all the unapproved licenses...")
            skipping_entries = True
            continue

        if skipping_entries and re.match(entry_pattern, text):
            # Still inside the listing: drop the entry
            continue

        skipping_entries = False
        result.append(text)

    return result
def clean_output(output: bytes) -> str:
    """
    Decodes raw build output and strips its noisiest blocks.

    Args:
        output (bytes): The raw output of a build command.

    Returns:
        str: The decoded output, with download runs and unapproved-licence
            listings collapsed into `[CRAB]` placeholder lines.
    """
    # Build tools may emit bytes that are not valid UTF-8; replacing them keeps
    # one odd character from aborting the whole run with a UnicodeDecodeError.
    output_lines = output.decode(errors="replace").split("\n")

    cleaned_lines = merge_download_lines(output_lines)
    cleaned_lines = merge_unapproved_licences(cleaned_lines)

    return "\n".join(cleaned_lines)