mirror of
https://github.com/karma-riuk/crab.git
synced 2025-07-05 05:28:13 +02:00
extracted the logic of handling maven or gradle to
a simple class structure
This commit is contained in:
201
clone_repos.py
201
clone_repos.py
@ -1,24 +1,18 @@
|
||||
import pandas as pd
|
||||
import argparse, os, sys, subprocess, docker, re
|
||||
import argparse, os, sys, subprocess, docker
|
||||
from tqdm import tqdm
|
||||
import shutil
|
||||
from typing import Optional
|
||||
|
||||
from handlers import GradleHandler, MavenHandler, BuildHandler
|
||||
|
||||
tqdm.pandas()
|
||||
|
||||
USER_ID = os.getuid() # for container user
|
||||
GROUP_ID = os.getgid()
|
||||
|
||||
EXCLUSION_LIST = [
|
||||
"edmcouncil/idmp", # requires authentication
|
||||
"aosp-mirror/platform_frameworks_base", # takes ages to clone
|
||||
]
|
||||
|
||||
GRADLE_BASE_CMD = "gradle --no-daemon --console=plain"
|
||||
MAVEN_BASE_CMD = "mvn -B -Dstyle.color=never -Dartifact.download.skip=true"
|
||||
# -B (Batch Mode): Runs Maven in non-interactive mode, reducing output and removing download progress bars.
|
||||
# -Dstyle.color=never: Disables ANSI colors.
|
||||
# -Dartifact.download.skip=true: Prevents Maven from printing download logs (but still downloads dependencies when needed).
|
||||
|
||||
def clone(repo: str, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None:
|
||||
"""
|
||||
Clones a GitHub repository into a local directory.
|
||||
@ -49,7 +43,7 @@ def clone(repo: str, dest: str, updates: dict, force: bool = False, verbose: boo
|
||||
else:
|
||||
updates["cloned_successfully"] = True
|
||||
|
||||
def get_build_file(root: str, repo: str, updates: dict, verbose: bool = False):
|
||||
def get_build_handler(root: str, repo: str, updates: dict, verbose: bool = False) -> Optional[BuildHandler]:
|
||||
"""
|
||||
Get the path to the build file of a repository. The build file is either a
|
||||
`pom.xml`, `build.gradle`, or `build.xml` file.
|
||||
@ -76,9 +70,10 @@ def get_build_file(root: str, repo: str, updates: dict, verbose: bool = False):
|
||||
updates["depth_of_build_file"] = 0
|
||||
if entry.name == "build.gradle":
|
||||
updates["build_system"] = "gradle"
|
||||
return GradleHandler(path, entry.name, updates)
|
||||
else:
|
||||
updates["build_system"] = "maven"
|
||||
return os.path.join(path, entry.name)
|
||||
return MavenHandler(path, entry.name, updates)
|
||||
|
||||
# List files in the immediate subdirectories
|
||||
for entry in os.scandir(path):
|
||||
@ -89,41 +84,14 @@ def get_build_file(root: str, repo: str, updates: dict, verbose: bool = False):
|
||||
updates["depth_of_build_file"] = 1
|
||||
if entry.name == "build.gradle":
|
||||
updates["build_system"] = "gradle"
|
||||
return GradleHandler(path, os.path.join(entry.name, sub_entry.name), updates)
|
||||
else:
|
||||
updates["build_system"] = "maven"
|
||||
return os.path.join(path, entry.name, sub_entry.name)
|
||||
return MavenHandler(path, os.path.join(entry.name, sub_entry.name), updates)
|
||||
|
||||
updates["error_msg"] = "No build file found"
|
||||
return None
|
||||
|
||||
def has_tests(path: str, build_file: str, updates: dict) -> bool:
|
||||
with open(build_file, "r") as f:
|
||||
content = f.read()
|
||||
|
||||
for library in ["junit", "testng", "mockito"]:
|
||||
if library in content:
|
||||
updates["detected_source_of_tests"] = library + " library in build file"
|
||||
return True
|
||||
|
||||
for keyword in ["testImplementation", "functionalTests", "bwc_tests_enabled"]:
|
||||
if keyword in content:
|
||||
updates["detected_source_of_tests"] = keyword + " keyword in build file"
|
||||
return False
|
||||
|
||||
test_dirs = [
|
||||
"src/test/java",
|
||||
"src/test/kotlin",
|
||||
"src/test/groovy",
|
||||
"test",
|
||||
]
|
||||
for td in test_dirs:
|
||||
if os.path.exists(os.path.join(path, td)):
|
||||
updates["detected_source_of_tests"] = td + " dir exists in repo"
|
||||
return True
|
||||
|
||||
updates["error_msg"] = "No tests found"
|
||||
return False
|
||||
|
||||
def remove_dir(dir: str) -> None:
|
||||
"""
|
||||
Removes a directory and all its contents. Removes parent directorie if it is empty after removing child (dir).
|
||||
@ -136,113 +104,6 @@ def remove_dir(dir: str) -> None:
|
||||
if os.listdir(parent) == []:
|
||||
shutil.rmtree(parent)
|
||||
|
||||
def merge_download_lines(lines: list) -> list:
|
||||
"""
|
||||
Merges lines that are part of the same download block in Maven output.
|
||||
|
||||
Args:
|
||||
lines (list): The lines to merge.
|
||||
|
||||
Returns:
|
||||
list: The merged lines.
|
||||
"""
|
||||
downloading_block = False
|
||||
cleaned_lines = []
|
||||
for line in lines:
|
||||
if re.match(r"\[INFO\] Download(ing|ed) from", line):
|
||||
if not downloading_block:
|
||||
cleaned_lines.append("[CRAB] Downloading stuff")
|
||||
downloading_block = True
|
||||
else:
|
||||
cleaned_lines.append(line)
|
||||
downloading_block = False
|
||||
return cleaned_lines
|
||||
|
||||
def merge_unapproved_licences(lines: list) -> list:
|
||||
"""
|
||||
Merges lines that are part of the same unapproved licences block in Maven output.
|
||||
|
||||
Args:
|
||||
lines (list): The lines to merge.
|
||||
|
||||
Returns:
|
||||
list: The merged lines.
|
||||
"""
|
||||
licenses_block = False
|
||||
cleaned_lines = []
|
||||
for line in lines:
|
||||
if re.match(r"\[WARNING\] Files with unapproved licenses:", line):
|
||||
cleaned_lines.append(line)
|
||||
cleaned_lines.append("[CRAB] List of all the unapproved licenses...")
|
||||
licenses_block = True
|
||||
elif licenses_block and not re.match(r"\s+\?\/\.m2\/repository", line):
|
||||
licenses_block = False
|
||||
|
||||
if not licenses_block:
|
||||
cleaned_lines.append(line)
|
||||
return cleaned_lines
|
||||
|
||||
def clean_output(output: bytes) -> str:
|
||||
output_lines = output.decode().split("\n")
|
||||
|
||||
cleaned_lines = merge_download_lines(output_lines)
|
||||
cleaned_lines = merge_unapproved_licences(cleaned_lines)
|
||||
|
||||
return "\n".join(cleaned_lines)
|
||||
|
||||
def compile_repo(build_file: str, container, updates: dict) -> bool:
|
||||
"""
|
||||
Attempts to compile a repository inside a running Docker container.
|
||||
"""
|
||||
if build_file.endswith("pom.xml"):
|
||||
build_cmd = f"{MAVEN_BASE_CMD} clean compile"
|
||||
elif build_file.endswith("build.gradle"):
|
||||
build_cmd = f"{GRADLE_BASE_CMD} compileJava"
|
||||
else:
|
||||
updates["error_msg"] = "Unsupported build system for compiling: " + build_file
|
||||
return False
|
||||
|
||||
exec_result = container.exec_run(build_cmd)
|
||||
output = clean_output(exec_result.output)
|
||||
if exec_result.exit_code != 0:
|
||||
updates["compiled_successfully"] = False
|
||||
updates["error_msg"] = output
|
||||
return False
|
||||
|
||||
updates["compiled_successfully"] = True
|
||||
return True
|
||||
|
||||
def test_repo(build_file: str, container, updates: dict) -> bool:
|
||||
if build_file.endswith("pom.xml"):
|
||||
test_cmd = f"{MAVEN_BASE_CMD} test"
|
||||
elif build_file.endswith("build.gradle"):
|
||||
test_cmd = f"{GRADLE_BASE_CMD} test"
|
||||
else:
|
||||
updates["error_msg"] = "Unsupported build system for testing: " + build_file
|
||||
return False
|
||||
|
||||
exec_result = container.exec_run(test_cmd)
|
||||
output = clean_output(exec_result.output)
|
||||
if exec_result.exit_code != 0:
|
||||
updates["tested_successfully"] = False
|
||||
updates["error_msg"] = output
|
||||
return False
|
||||
|
||||
updates["tested_successfully"] = True
|
||||
updates["error_msg"] = output
|
||||
|
||||
return True
|
||||
|
||||
def clean_repo(build_file: str, container):
|
||||
if build_file.endswith("pom.xml"):
|
||||
clean_cmd = f"{MAVEN_BASE_CMD} clean"
|
||||
elif build_file.endswith("build.gradle"):
|
||||
clean_cmd = f"{GRADLE_BASE_CMD} clean"
|
||||
else:
|
||||
return
|
||||
|
||||
container.exec_run(clean_cmd)
|
||||
|
||||
def process_row(repo, client, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None:
|
||||
updates["good_repo_for_crab"] = False
|
||||
with tqdm(total=5, leave=False) as pbar:
|
||||
@ -261,55 +122,41 @@ def process_row(repo, client, dest: str, updates: dict, force: bool = False, ver
|
||||
updates["error_msg"] = "Repo not cloned"
|
||||
return
|
||||
|
||||
pbar.set_postfix_str("Getting build file...")
|
||||
build_file = get_build_file(dest, repo, updates)
|
||||
if build_file is None:
|
||||
pbar.set_postfix_str("Getting build handler...")
|
||||
build_handler = get_build_handler(dest, repo, updates)
|
||||
if build_handler is None:
|
||||
if verbose: print(f"Removing {repo}, no build file")
|
||||
remove_dir(repo_path)
|
||||
return
|
||||
pbar.update(1)
|
||||
|
||||
pbar.set_postfix_str("Checking for tests...")
|
||||
if not has_tests(repo_path, build_file, updates):
|
||||
if verbose: print(f"Removing {repo}, no test suites")
|
||||
remove_dir(repo_path)
|
||||
return
|
||||
if verbose: print(f"Keeping {repo}")
|
||||
pbar.update(1)
|
||||
|
||||
container = client.containers.run(
|
||||
image="crab-java-env",
|
||||
command="tail -f /dev/null",
|
||||
volumes={os.path.abspath(repo_path): {"bind": "/repo", "mode": "rw"}},
|
||||
user=f"{USER_ID}:{GROUP_ID}",
|
||||
detach=True,
|
||||
tty=True
|
||||
)
|
||||
build_handler.set_client(client)
|
||||
with build_handler:
|
||||
pbar.set_postfix_str("Checking for tests...")
|
||||
if not build_handler.has_tests():
|
||||
if verbose: print(f"Removing {repo}, no test suites")
|
||||
remove_dir(repo_path)
|
||||
return
|
||||
if verbose: print(f"Keeping {repo}")
|
||||
pbar.update(1)
|
||||
|
||||
try:
|
||||
pbar.set_postfix_str("Compiling...")
|
||||
compiled = compile_repo(build_file, container, updates)
|
||||
if not compiled:
|
||||
if not build_handler.compile_repo():
|
||||
if verbose: print(f"Removing {repo}, failed to compile")
|
||||
clean_repo(build_file, container)
|
||||
remove_dir(repo_path)
|
||||
return
|
||||
pbar.update(1)
|
||||
|
||||
pbar.set_postfix_str("Running tests...")
|
||||
tested = test_repo(build_file, container, updates)
|
||||
clean_repo(build_file, container)
|
||||
if not tested:
|
||||
if not build_handler.test_repo():
|
||||
if verbose: print(f"Removing {repo}, failed to run tests")
|
||||
remove_dir(repo_path)
|
||||
return
|
||||
build_handler.clean_repo()
|
||||
pbar.update(1)
|
||||
|
||||
# If repo was not removed, then it is a good repo
|
||||
updates["good_repo_for_crab"] = True
|
||||
finally:
|
||||
container.kill()
|
||||
container.remove()
|
||||
|
||||
def save_df_with_updates(df, updates_list, verbose=False):
|
||||
# Create columns for the new data
|
||||
|
194
handlers.py
Normal file
194
handlers.py
Normal file
@ -0,0 +1,194 @@
|
||||
from abc import ABC, abstractmethod
|
||||
import os, re, docker
|
||||
|
||||
USER_ID = os.getuid() # for container user
|
||||
GROUP_ID = os.getgid()
|
||||
|
||||
class BuildHandler(ABC):
|
||||
def __init__(self, repo_path: str, build_file: str, updates: dict) -> None:
|
||||
super().__init__()
|
||||
self.path: str = repo_path
|
||||
# self.container: Optional[Container] = None
|
||||
self.build_file: str = build_file
|
||||
self.updates = updates
|
||||
|
||||
def set_client(self, client: docker.DockerClient):
|
||||
self.client = client
|
||||
|
||||
def __enter__(self):
|
||||
self.container = self.client.containers.run(
|
||||
image=self.container_name(),
|
||||
command="tail -f /dev/null", # to keep the container alive
|
||||
volumes={os.path.abspath(self.path): {"bind": "/repo", "mode": "rw"}},
|
||||
user=f"{USER_ID}:{GROUP_ID}",
|
||||
detach=True,
|
||||
tty=True
|
||||
)
|
||||
|
||||
def __exit__(self, *args):
|
||||
self.container.kill()
|
||||
self.container.remove()
|
||||
|
||||
|
||||
def has_tests(self) -> bool:
|
||||
with open(os.path.join(self.path, self.build_file), "r") as f:
|
||||
content = f.read()
|
||||
|
||||
for library in ["junit", "testng", "mockito"]:
|
||||
if library in content:
|
||||
self.updates["detected_source_of_tests"] = library + " library in build file"
|
||||
return True
|
||||
|
||||
for keyword in ["testImplementation", "functionalTests", "bwc_tests_enabled"]:
|
||||
if keyword in content:
|
||||
self.updates["detected_source_of_tests"] = keyword + " keyword in build file"
|
||||
return False
|
||||
|
||||
test_dirs = [
|
||||
"src/test/java",
|
||||
"src/test/kotlin",
|
||||
"src/test/groovy",
|
||||
"test",
|
||||
]
|
||||
for td in test_dirs:
|
||||
if os.path.exists(os.path.join(self.path, td)):
|
||||
self.updates["detected_source_of_tests"] = td + " dir exists in repo"
|
||||
return True
|
||||
|
||||
self.updates["error_msg"] = "No tests found"
|
||||
return False
|
||||
|
||||
def compile_repo(self) -> bool:
|
||||
exec_result = self.container.exec_run(self.compile_cmd())
|
||||
output = clean_output(exec_result.output)
|
||||
if exec_result.exit_code != 0:
|
||||
self.updates["compiled_successfully"] = False
|
||||
self.updates["error_msg"] = output
|
||||
return False
|
||||
|
||||
self.updates["compiled_successfully"] = True
|
||||
return True
|
||||
|
||||
def test_repo(self) -> bool:
|
||||
exec_result = self.container.exec_run(self.test_cmd())
|
||||
output = clean_output(exec_result.output)
|
||||
if exec_result.exit_code != 0:
|
||||
self.updates["tested_successfully"] = False
|
||||
self.updates["error_msg"] = output
|
||||
return False
|
||||
|
||||
self.updates["tested_successfully"] = True
|
||||
self.updates["error_msg"] = output
|
||||
|
||||
return True
|
||||
|
||||
def clean_repo(self) -> None:
|
||||
self.container.exec_run(self.clean_cmd())
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def compile_cmd(self) -> str:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def test_cmd(self) -> str:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def clean_cmd(self) -> str:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def container_name(self) -> str:
|
||||
pass
|
||||
|
||||
class MavenHandler(BuildHandler):
|
||||
def __init__(self, repo_path: str, build_file: str, updates: dict) -> None:
|
||||
super().__init__(repo_path, build_file, updates)
|
||||
self.base_cmd = "mvn -B -Dstyle.color=never -Dartifact.download.skip=true"
|
||||
# -B (Batch Mode): Runs Maven in non-interactive mode, reducing output and removing download progress bars.
|
||||
# -Dstyle.color=never: Disables ANSI colors.
|
||||
# -Dartifact.download.skip=true: Prevents Maven from printing download logs (but still downloads dependencies when needed).
|
||||
|
||||
def compile_cmd(self) -> str:
|
||||
return f"{self.base_cmd} clean compile"
|
||||
|
||||
def test_cmd(self) -> str:
|
||||
return f"{self.base_cmd} test"
|
||||
|
||||
def clean_cmd(self) -> str:
|
||||
return f"{self.base_cmd} clean"
|
||||
|
||||
def container_name(self) -> str:
|
||||
return "crab-maven"
|
||||
|
||||
class GradleHandler(BuildHandler):
|
||||
def __init__(self, repo_path: str, build_file: str, updates: dict) -> None:
|
||||
super().__init__(repo_path, build_file, updates)
|
||||
self.base_cmd = "gradle --no-daemon --console=plain"
|
||||
|
||||
def compile_cmd(self) -> str:
|
||||
return f"{self.base_cmd} compileJava"
|
||||
|
||||
def test_cmd(self) -> str:
|
||||
return f"{self.base_cmd} test"
|
||||
|
||||
def clean_cmd(self) -> str:
|
||||
return f"{self.base_cmd} clean"
|
||||
|
||||
def container_name(self) -> str:
|
||||
return "crab-gradle"
|
||||
|
||||
def merge_download_lines(lines: list) -> list:
|
||||
"""
|
||||
Merges lines that are part of the same download block in Maven output.
|
||||
|
||||
Args:
|
||||
lines (list): The lines to merge.
|
||||
|
||||
Returns:
|
||||
list: The merged lines.
|
||||
"""
|
||||
downloading_block = False
|
||||
cleaned_lines = []
|
||||
for line in lines:
|
||||
if re.match(r"\[INFO\] Download(ing|ed) from", line):
|
||||
if not downloading_block:
|
||||
cleaned_lines.append("[CRAB] Downloading stuff")
|
||||
downloading_block = True
|
||||
else:
|
||||
cleaned_lines.append(line)
|
||||
downloading_block = False
|
||||
return cleaned_lines
|
||||
|
||||
def merge_unapproved_licences(lines: list) -> list:
|
||||
"""
|
||||
Merges lines that are part of the same unapproved licences block in Maven output.
|
||||
|
||||
Args:
|
||||
lines (list): The lines to merge.
|
||||
|
||||
Returns:
|
||||
list: The merged lines.
|
||||
"""
|
||||
licenses_block = False
|
||||
cleaned_lines = []
|
||||
for line in lines:
|
||||
if re.match(r"\[WARNING\] Files with unapproved licenses:", line):
|
||||
cleaned_lines.append(line)
|
||||
cleaned_lines.append("[CRAB] List of all the unapproved licenses...")
|
||||
licenses_block = True
|
||||
elif licenses_block and not re.match(r"\s+\?\/\.m2\/repository", line):
|
||||
licenses_block = False
|
||||
|
||||
if not licenses_block:
|
||||
cleaned_lines.append(line)
|
||||
return cleaned_lines
|
||||
|
||||
def clean_output(output: bytes) -> str:
|
||||
output_lines = output.decode().split("\n")
|
||||
|
||||
cleaned_lines = merge_download_lines(output_lines)
|
||||
cleaned_lines = merge_unapproved_licences(cleaned_lines)
|
||||
|
||||
return "\n".join(cleaned_lines)
|
Reference in New Issue
Block a user