added the dataset and the handlers from the dataset_builder and started modifying them for the webapp
src/utils/dataset.py (new file, 201 lines)
@@ -0,0 +1,201 @@
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional, Union
import json, uuid


# fmt: off
@dataclass
class FileData:
    is_code_related: bool
    coverage: Dict[str, float]  # jacoco-report -> coverage
    content_before_pr: str = ""
    content_after_pr: str = ""


@dataclass
class Comment:
    body: str
    file: str
    from_: int
    to: int
    paraphrases: List[str] = field(default_factory=list)


@dataclass
class Selection:
    comment_suggests_change: bool
    diff_after_address_change: Optional[bool]
    is_code_related: bool


class ArchiveState(Enum):
    BASE = "base"
    MERGED = "merged"


@dataclass
class Metadata:
    id: str
    repo: str  # the name of the repo, in the form XXX/YYY
    pr_number: int
    pr_title: str
    pr_body: str
    merge_commit_sha: str  # to checkout for the tests
    successful: bool = True
    build_system: str = ""
    reason_for_failure: str = ""
    last_cmd_error_msg: str = ""
    selection: Optional[Selection] = None

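    # e.g. repo="apache/commons-lang", pr_number=42 and ArchiveState.MERGED yield
    # "apache_commons-lang_42_merged.tar.gz" (or "<id>_merged.tar.gz" with only_id=True)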
    def archive_name(self, state: ArchiveState, only_id: bool = False):
        if only_id:
            return f"{self.id}_{state.value}.tar.gz"
        return f"{self.repo.replace('/', '_')}_{self.pr_number}_{state.value}.tar.gz"


@dataclass
class DatasetEntry:
    metadata: Metadata
    files: Dict[str, FileData]  # filename -> file data, files before the PR (before the first PR commits)
    diffs_before: Dict[str, str]  # filename -> diff, diffs between the opening of the PR and the comment
    comments: List[Comment]
    diffs_after: Dict[str, str]  # filename -> diff, changes after the comment


@dataclass
class CommentGenEntry:
    id: str
    files: Dict[str, str]  # filename -> file content
    diffs: Dict[str, str]  # filename -> diff, diffs between the opening of the PR and the comment

    @staticmethod
    def from_entry(entry: DatasetEntry) -> "CommentGenEntry":
        return CommentGenEntry(
            id=entry.metadata.id,
            files={fname: fdata.content_before_pr for fname, fdata in entry.files.items()},
            diffs=entry.diffs_before,
        )


@dataclass
class CodeRefinementEntry:
    id: str
    files: Dict[str, str]  # filename -> file content
    diffs: Dict[str, str]  # filename -> diff, diffs between the opening of the PR and the comment
    comments: List[Comment]

    @staticmethod
    def from_entry(entry: DatasetEntry) -> "CodeRefinementEntry":
        return CodeRefinementEntry(
            id=entry.metadata.id,
            files={fname: fdata.content_before_pr for fname, fdata in entry.files.items()},
            diffs=entry.diffs_before,
            comments=entry.comments,
        )


class OutputType(Enum):
    FULL = "full"
    CODE_REFINEMENT = "code_refinement"
    COMMENT_GEN = "comment_gen"


# fmt: on
@dataclass
class Dataset:
    entries: List[DatasetEntry] = field(default_factory=list)

    def __len__(self) -> int:
        return sum(1 for entry in self.entries if entry.metadata.successful)

    def to_json(
        self,
        filename: str,
        type_: OutputType = OutputType.FULL,
        remove_non_suggesting: bool = False,
    ) -> None:
        """Serialize the dataset to a JSON file, filtering entries according to `type_`"""

        entries_to_dump = self.entries

        if type_ == OutputType.COMMENT_GEN:
            entries_to_dump = [
                entry
                for entry in self.entries
                if entry.metadata.selection and entry.metadata.selection.comment_suggests_change
            ]
        elif type_ == OutputType.CODE_REFINEMENT:
            entries_to_dump = [
                entry
                for entry in self.entries
                if entry.metadata.selection
                and entry.metadata.selection.diff_after_address_change
                and entry.metadata.selection.is_code_related
            ]
        elif type_ == OutputType.FULL and remove_non_suggesting:
            entries_to_dump = [
                entry
                for entry in self.entries
                if entry.metadata.selection and entry.metadata.selection.comment_suggests_change
            ]

        to_dump = Dataset(entries=entries_to_dump)
        # print(f"{len(entries_to_dump)} entries...", end=" ", flush=True)

        def transform_entry(entry: Union[DatasetEntry, Dataset, Any]) -> Union[dict, list]:
            if not isinstance(entry, (DatasetEntry, Dataset)):
                return entry.__dict__

            if type_ == OutputType.FULL:
                return entry.__dict__

            if isinstance(entry, Dataset):
                return entry.entries

            if type_ == OutputType.COMMENT_GEN:
                return CommentGenEntry.from_entry(entry).__dict__

            if type_ == OutputType.CODE_REFINEMENT:
                return CodeRefinementEntry.from_entry(entry).__dict__

        with open(filename, "w", encoding="utf-8") as f:
            json.dump(to_dump, f, default=transform_entry, indent=4)

    @staticmethod
    def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset":
        with open(filename, "r", encoding="utf-8") as f:
            print(f"Loading dataset from {filename}...", end=" ", flush=True)
            data = json.load(f)
            print("Done")

        entries = []
        for entry_data in data["entries"]:
            metadata_data = entry_data["metadata"]
            selection_data = metadata_data["selection"] if "selection" in metadata_data else None
            selection = Selection(**selection_data) if selection_data else None
            metadata_data["selection"] = selection
            if "id" not in metadata_data:
                metadata_data["id"] = uuid.uuid4().hex
            metadata = Metadata(**metadata_data)

            if (
                not keep_still_in_progress
                and metadata.reason_for_failure == "Was still being processed"
            ):
                continue

            files = {fname: FileData(**fdata) for fname, fdata in entry_data["files"].items()}

            comments = [Comment(**comment) for comment in entry_data["comments"]]

            entry = DatasetEntry(
                metadata=metadata,
                files=files,
                diffs_before=entry_data["diffs_before"],
                comments=comments,
                diffs_after=entry_data["diffs_after"],
            )
            entries.append(entry)

        return Dataset(entries=entries)

    def build_reference_map(self) -> Dict[str, DatasetEntry]:
        """Build a map from entry id to its DatasetEntry"""

        ref_map = {}
        for entry in self.entries:
            ref_map[entry.metadata.id] = entry
        return ref_map
src/utils/handlers.py (new file, 622 lines)
@@ -0,0 +1,622 @@
from abc import ABC, abstractmethod
import os, re, docker, signal, javalang
from bs4 import BeautifulSoup
from typing import Iterable, Optional, Tuple, Iterator
import xml.etree.ElementTree as ET
from javalang.tree import PackageDeclaration
import tarfile
import tempfile
from shutil import rmtree

REPORT_SIZE_THRESHOLD = 400  # reports smaller than 400 bytes (characters) are ignored


USER_ID = os.getuid()  # for container user
GROUP_ID = os.getgid()


class BuildHandler(ABC):
    def __init__(self, repo_path: str, build_file: str, updates: dict) -> None:
        super().__init__()
        self.path: str = repo_path
        self.build_file: str = build_file
        self.updates = updates

    def set_client(self, client: docker.DockerClient):
        self.client = client

    def __enter__(self):
        self.container = self.client.containers.run(
            image=self.container_name(),
            command="tail -f /dev/null",  # to keep the container alive
            volumes={os.path.abspath(self.path): {"bind": "/repo", "mode": "rw"}},
            user=f"{USER_ID}:{GROUP_ID}",
            detach=True,
            tty=True,
        )

    def __exit__(self, *args):
        self.container.kill()
        self.container.remove()
        rmtree(self.path)

    def compile_repo(self) -> None:
        def timeout_handler(signum, frame):
            raise TimeoutError("Compilation exceeded time limit")

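        # NOTE: SIGALRM-based alarms are Unix-only and must be set from the main thread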
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(3600)  # Set timeout to 1 hour (3600 seconds)

        try:
            exec_result = self.container.exec_run(self.compile_cmd())
            output = clean_output(exec_result.output)
            if exec_result.exit_code != 0:
                raise FailedToCompileError(output)
        except TimeoutError:
            self.updates["compiled_successfully"] = False
            self.updates[
                "error_msg"
            ] = "Compile process killed due to exceeding the 1-hour time limit"
        finally:
            signal.alarm(0)  # Cancel the alarm

    def test_repo(self) -> None:
        def timeout_handler(signum, frame):
            raise TimeoutError("Tests exceeded time limit")

        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(3600)  # Set timeout to 1 hour (3600 seconds)

        try:
            exec_result = self.container.exec_run(self.test_cmd())
            output = clean_output(exec_result.output)
            if exec_result.exit_code != 0:
                raise FailedToTestError(output)

            self.extract_test_numbers(output)

        except TimeoutError:
            self.updates["tested_successfully"] = False
            self.updates["error_msg"] = "Test process killed due to exceeding the 1-hour time limit"
            return

        finally:
            signal.alarm(0)  # Cancel the alarm

    def generate_coverage_report(self, already_injected_manually: bool = False) -> None:
        result = self.container.exec_run(self.generate_coverage_report_cmd())
        if result.exit_code != 0:
            if already_injected_manually:
                raise CantExecJacoco(clean_output(result.output))

            build_file_path = os.path.join(self.path, self.build_file)
            if not os.path.exists(build_file_path):
                raise CantInjectJacoco(f"'{self.build_file}' not found")
            with open(build_file_path, "r") as f:
                og_content = f.read()
            try:
                self._try_to_inject_jacoco(build_file_path)
                self.generate_coverage_report(already_injected_manually=True)
            except (CantInjectJacoco, CantExecJacoco) as e:
                with open(build_file_path, "w") as f:
                    f.write(og_content)
                raise e

    @abstractmethod
    def _try_to_inject_jacoco(self, build_file_path: str) -> None:
        pass

    def check_coverage(self, filename: str) -> Iterator[Tuple[str, float]]:
        """
        Yield (report path, coverage) for every JaCoCo report in which `filename` is covered.
        """
        found_at_least_one = False
        candidates = []
        for coverage_report_path in self.get_jacoco_report_paths():
            if not os.path.exists(coverage_report_path):
                raise NoCoverageReportFound(
                    f"Coverage report file '{coverage_report_path}' does not exist"
                )

            fully_qualified_class = self._extract_fully_qualified_class(filename)
            candidates.append({"report_file": coverage_report_path, "fqc": fully_qualified_class})
            # if coverage_report_path[:len(src_dir)] != src_dir:
            #     continue
            coverage = get_coverage_for_file(
                coverage_report_path, fully_qualified_class, os.path.basename(filename)
            )
            if coverage != -1:
                found_at_least_one = True
                yield coverage_report_path, coverage

        if not found_at_least_one:
            raise FileNotCovered(
                f"File '{filename}' didn't have any coverage in any of the jacoco reports: {candidates}"
            )

    def _extract_fully_qualified_class(self, filepath: str) -> str:
        if not filepath.endswith('.java'):
            raise NotJavaFileError(f"File '{filepath}' does not end with .java")

        if not os.path.exists(os.path.join(self.path, filepath)):
            raise FileNotFoundInRepoError(f"File '{filepath}' not found in repo")

        with open(os.path.join(self.path, filepath)) as f:
            try:
                parsed_tree = javalang.parse.parse(f.read())
            except javalang.parser.JavaSyntaxError as e:
                raise NotJavaFileError(
                    f"File '{filepath}' has a syntax error and could not be parsed by javalang, raised error: '{e}'"
                )

        package_name = None
        for _, node in parsed_tree.filter(PackageDeclaration):
            package_name = node.name  # type: ignore
            break  # Stop after finding the first package declaration

        if package_name is None:
            raise NoPackageFoundError(
                f"File '{filepath}' did not have a package name recognized by javalang"
            )

        fully_qualified_class = package_name.replace('.', '/')
        # src_dir = filepath[:filepath.index(fully_qualified_class)]
        fully_qualified_class += "/" + os.path.basename(filepath)[:-5]  # -5 to remove '.java'
        return fully_qualified_class

    def clean_repo(self) -> None:
        self.container.exec_run(self.clean_cmd())

    @abstractmethod
    def get_type(self) -> str:
        pass

    @abstractmethod
    def compile_cmd(self) -> str:
        pass

    @abstractmethod
    def test_cmd(self) -> str:
        pass

    @abstractmethod
    def extract_test_numbers(self, output: str) -> None:
        pass

    @abstractmethod
    def clean_cmd(self) -> str:
        pass

    @abstractmethod
    def generate_coverage_report_cmd(self) -> str:
        pass

    @abstractmethod
    def get_jacoco_report_paths(self) -> Iterable[str]:
        pass

    @abstractmethod
    def container_name(self) -> str:
        pass


class MavenHandler(BuildHandler):
    def __init__(self, repo_path: str, build_file: str, updates: Optional[dict] = None) -> None:
        super().__init__(repo_path, build_file, updates if updates is not None else {})
        self.base_cmd = "mvn -B -Dstyle.color=never -Dartifact.download.skip=true"
        # -B (Batch Mode): Runs Maven in non-interactive mode, reducing output and removing download progress bars.
        # -Dstyle.color=never: Disables ANSI colors.
        # -Dartifact.download.skip=true: Prevents Maven from printing download logs (but still downloads dependencies when needed).

    def get_type(self) -> str:
        return "maven"

    def compile_cmd(self) -> str:
        return f"{self.base_cmd} clean compile"

    def test_cmd(self) -> str:
        return f"{self.base_cmd} test"

    def clean_cmd(self) -> str:
        return f"{self.base_cmd} clean"

    def generate_coverage_report_cmd(self) -> str:
        return f"{self.base_cmd} jacoco:report-aggregate"

    def container_name(self) -> str:
        return "crab-maven"

    def extract_test_numbers(self, output: str) -> None:
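        # Matches the surefire summary block, which looks like:
        #   [INFO] Results:
        #   [INFO]
        #   [INFO] Tests run: 42, Failures: 1, Errors: 0, Skipped: 2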
        pattern = r"\[INFO\] Results:\n\[INFO\]\s*\n\[INFO\] Tests run: (\d+), Failures: (\d+), Errors: (\d+), Skipped: (\d+)"

        matches = re.findall(pattern, output)

        self.updates["n_tests"] = 0
        self.updates["n_tests_passed"] = 0  # Passed tests = Tests run - (Failures + Errors)
        self.updates["n_tests_failed"] = 0
        self.updates["n_tests_errors"] = 0
        self.updates["n_tests_skipped"] = 0

        if len(matches) == 0:
            raise NoTestResultsToExtractError("No test results found in Maven output:\n" + output)

        for match in matches:
            tests_run, failures, errors, skipped = map(int, match)
            self.updates["n_tests"] += tests_run
            self.updates["n_tests_failed"] += failures
            self.updates["n_tests_errors"] += errors
            self.updates["n_tests_skipped"] += skipped
            self.updates["n_tests_passed"] += tests_run - (
                failures + errors
            )  # Calculate passed tests

    def get_jacoco_report_paths(self) -> Iterable[str]:
        found_at_least_one = False
        for root, _, files in os.walk(self.path):
            if "target/site" not in root:
                continue  # to avoid any misleading jacoco.xml randomly lying around
            for file in files:
                if file == "jacoco.xml":
                    found_at_least_one = True
                    yield os.path.join(root, file)
        if not found_at_least_one:
            raise NoCoverageReportFound(f"Couldn't find any 'jacoco.xml' in {self.path}")

    def _try_to_inject_jacoco(self, build_file_path: str) -> None:
        with open(build_file_path, "r", encoding="utf-8") as f:
            content = f.read()

        if "<artifactId>jacoco-maven-plugin</artifactId>" in content:
            return  # already present

        jacoco_plugin = """
        <plugin>
            <groupId>org.jacoco</groupId>
            <artifactId>jacoco-maven-plugin</artifactId>
            <version>0.8.8</version>
            <executions>
                <execution>
                    <goals>
                        <goal>prepare-agent</goal>
                    </goals>
                </execution>
                <execution>
                    <id>report</id>
                    <phase>test</phase>
                    <goals>
                        <goal>report</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        """

        if "<plugins>" in content:
            # just insert inside existing plugins
            content = content.replace("<plugins>", f"<plugins>\n{jacoco_plugin}")
        elif "</project>" in content:
            # plugins section doesn't exist, create full <build> section
            build_block = f"""
        <build>
            <plugins>
                {jacoco_plugin}
            </plugins>
        </build>
        """
            content = content.replace("</project>", f"{build_block}\n</project>")
        else:
            raise CantInjectJacoco("Could not find insertion point for plugins in pom.xml")

        with open(build_file_path, "w", encoding="utf-8") as f:
            f.write(content)


class GradleHandler(BuildHandler):
    def __init__(self, repo_path: str, build_file: str, updates: Optional[dict] = None) -> None:
        super().__init__(repo_path, build_file, updates if updates is not None else {})
        self.base_cmd = "gradle --no-daemon --console=plain"

    def get_type(self) -> str:
        return "gradle"

    def compile_cmd(self) -> str:
        return f"{self.base_cmd} compileJava"

    def test_cmd(self) -> str:
        return f"{self.base_cmd} test"

    def clean_cmd(self) -> str:
        return f"{self.base_cmd} clean"

    def generate_coverage_report_cmd(self) -> str:
        return f"{self.base_cmd} jacocoTestReport"

    def container_name(self) -> str:
        return "crab-gradle"

    def extract_test_numbers(self, output: str) -> None:
        self.updates["n_tests"] = -1
        self.updates["n_tests_passed"] = -1
        self.updates["n_tests_failed"] = -1
        self.updates["n_tests_errors"] = -1
        self.updates["n_tests_skipped"] = -1

        test_results_path = os.path.join(self.path, "build/reports/tests/test/index.html")
        if not os.path.exists(test_results_path):
            raise NoTestResultsToExtractError(
                "No test results found (probably a repo with sub-projects)"
            )

        # Load the HTML report that Gradle writes for the `test` task
        with open(test_results_path, "r") as file:
            soup = BeautifulSoup(file, "html.parser")

        test_div = soup.select_one("div.infoBox#tests")
        if test_div is None:
            raise NoTestResultsToExtractError("No test results found (no div.infoBox#tests)")

        counter_div = test_div.select_one("div.counter")
        if counter_div is None:
            raise NoTestResultsToExtractError(
                "No test results found (no div.counter for tests)"
            )

        self.updates["n_tests"] = int(counter_div.text.strip())

        failures_div = soup.select_one("div.infoBox#failures")
        if failures_div is None:
            raise NoTestResultsToExtractError("No test results found (no div.infoBox#failures)")

        counter_div = failures_div.select_one("div.counter")
        if counter_div is None:
            raise NoTestResultsToExtractError(
                "No test results found (no div.counter for failures)"
            )

        self.updates["n_tests_failed"] = int(counter_div.text.strip())

        # Calculate passed tests
        self.updates["n_tests_passed"] = (
            self.updates["n_tests"] - self.updates["n_tests_failed"]
        )

    def get_jacoco_report_paths(self) -> Iterable[str]:
        # jacocoTestReport (with xml.required = true, injected below) writes an XML
        # report next to the HTML one; only the XML can be parsed by get_coverage_for_file
        found_at_least_one = False
        for root, _, files in os.walk(self.path):
            if "reports/jacoco" not in root:
                continue
            for file in files:
                if file.endswith(".xml"):
                    found_at_least_one = True
                    yield os.path.join(root, file)
        if not found_at_least_one:
            raise NoCoverageReportFound(
                f"Couldn't find any JaCoCo XML report inside any 'reports/jacoco' in {self.path}"
            )

    def _try_to_inject_jacoco(self, build_file_path: str) -> None:
        with open(build_file_path, "r", encoding="utf-8") as f:
            content = f.read()

        if "id 'jacoco'" in content or "apply plugin: 'jacoco'" in content:
            return  # already present

        jacoco_snippet = """
plugins {
    id 'jacoco'
}

jacoco {
    toolVersion = "0.8.8"
}

test {
    finalizedBy jacocoTestReport
}

jacocoTestReport {
    dependsOn test
    reports {
        xml.required = true
        html.required = true
    }
}"""

        content = jacoco_snippet + "\n\n" + content

        with open(build_file_path, "w", encoding="utf-8") as f:
            f.write(content)


class SetupException(Exception, ABC):
    reason_for_failure: str


class NotValidDirectory(SetupException):
    reason_for_failure = "The directory is not valid"


class CantFindBuildFile(SetupException):
    reason_for_failure = "Couldn't find the build file in the directory"


class HandlerException(Exception, ABC):
    reason_for_failure = "Generic handler exception (this shouldn't appear)"


class NoTestsFoundError(HandlerException):
    reason_for_failure = "No tests found"


class FailedToCompileError(HandlerException):
    reason_for_failure = "Failed to compile"


class FailedToTestError(HandlerException):
    reason_for_failure = "Failed to test"


class NoTestResultsToExtractError(HandlerException):
    reason_for_failure = "Failed to extract test results"


class CantExecJacoco(HandlerException):
    reason_for_failure = "Couldn't execute jacoco"


class CantInjectJacoco(HandlerException):
    reason_for_failure = "Couldn't inject jacoco in the build file"


class NoCoverageReportFound(HandlerException):
    reason_for_failure = "No coverage report was found"


class FileNotCovered(HandlerException):
    reason_for_failure = "Commented file from the PR wasn't covered"


class GradleAggregateReportNotFound(HandlerException):
    reason_for_failure = "Couldn't find the aggregate report (with gradle it's messy)"


class NotJavaFileError(HandlerException):
    reason_for_failure = "File that was checked for coverage was not a java file"


class NoPackageFoundError(HandlerException):
    reason_for_failure = "Java file did not contain a valid package name"


class FileNotFoundInRepoError(HandlerException):
    reason_for_failure = "Commented file not found in repo (likely renamed or deleted)"


def merge_download_lines(lines: list) -> list:
    """
    Merges lines that are part of the same download block in Maven output.

    Args:
        lines (list): The lines to merge.

    Returns:
        list: The merged lines.
    """
    downloading_block = False
    cleaned_lines = []
    for line in lines:
        if re.match(r"\[INFO\] Download(ing|ed) from", line):
            if not downloading_block:
                cleaned_lines.append("[CRAB] Downloading stuff")
                downloading_block = True
        else:
            cleaned_lines.append(line)
            downloading_block = False
    return cleaned_lines


def merge_unapproved_licences(lines: list) -> list:
    """
    Merges lines that are part of the same unapproved licences block in Maven output.

    Args:
        lines (list): The lines to merge.

    Returns:
        list: The merged lines.
    """
    licenses_block = False
    cleaned_lines = []
    for line in lines:
        if re.match(r"\[WARNING\] Files with unapproved licenses:", line):
            cleaned_lines.append(line)
            cleaned_lines.append("[CRAB] List of all the unapproved licenses...")
            licenses_block = True
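        # while the block lasts, each flagged file is listed on its own line
        # (presumably of the form "  ?/.m2/repository/..."); the first line that
        # doesn't match ends the block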
        elif licenses_block and not re.match(r"\s+\?\/\.m2\/repository", line):
            licenses_block = False

        if not licenses_block:
            cleaned_lines.append(line)
    return cleaned_lines


def clean_output(output: bytes) -> str:
    output_lines = output.decode().split("\n")

    cleaned_lines = merge_download_lines(output_lines)
    cleaned_lines = merge_unapproved_licences(cleaned_lines)

    return "\n".join(cleaned_lines)


def get_coverage_for_file(xml_file: str, target_fully_qualified_class: str, basename: str) -> float:
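    # A JaCoCo XML report nests classes under packages, e.g.:
    #   <package name="com/example">
    #     <class name="com/example/Foo" sourcefilename="Foo.java">
    #       <counter type="LINE" missed="3" covered="7"/>
    #     </class>
    #   </package>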
    # Parse the XML file
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Find coverage for the target file
    for package in root.findall(".//package"):
        for class_ in package.findall("class"):
            if (
                class_.get("sourcefilename") == basename
                and class_.get("name") == target_fully_qualified_class
            ):
                # Extract line coverage data
                line_counter = class_.find("counter[@type='LINE']")
                if line_counter is not None:
                    counter = line_counter.get("missed")
                    assert isinstance(counter, str)
                    missed = int(counter)
                    counter = line_counter.get("covered")
                    assert isinstance(counter, str)
                    covered = int(counter)
                    total = missed + covered
                    coverage = (covered / total) * 100 if total > 0 else 0
                    return coverage
    return -1


def get_build_handler(root: str, repo: str, verbose: bool = False) -> BuildHandler:
    """
    Get a BuildHandler for a repository, where `repo` may be either:

    - a directory under `root`, or
    - a .tar.gz/.tgz file in `root` containing your repo

    Returns:
        an instance of GradleHandler or MavenHandler
    """
    path = os.path.join(root, repo)

    # 1) If it's a tarball, extract it into a temp dir and work from there
    if os.path.isfile(path) and tarfile.is_tarfile(path):
        if verbose:
            print(f"Archive detected: extracting {path}…")
        archive_path = path
        path = tempfile.mkdtemp(prefix="repo_")
        with tarfile.open(archive_path, "r:gz") as tar:
            tar.extractall(path)

    # 2) Otherwise it must be a directory
    elif not os.path.isdir(path):
        raise NotValidDirectory(f"The path {path!r} is neither a directory nor a tar archive.")

    # 3) Now scan for build files
    to_keep = {"pom.xml", "build.gradle"}
    for entry in os.scandir(path):
        if entry.is_file() and entry.name in to_keep:
            if verbose:
                print(f"Found {entry.name!r} in {path!r}, returning handler")

            if entry.name == "build.gradle":
                return GradleHandler(path, entry.name)
            else:
                return MavenHandler(path, entry.name)

    if os.path.exists(path) and os.path.isdir(path):
        rmtree(path)

    raise CantFindBuildFile(f"Could not find any of {sorted(to_keep)} in {path!r}")
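
A minimal usage sketch (assumes a running local Docker daemon and pre-built "crab-maven"/"crab-gradle" images; the archive name, file path, and `utils.handlers` import path are hypothetical):

    import docker
    from utils.handlers import get_build_handler

    handler = get_build_handler("/tmp/archives", "apache_commons-lang_42_merged.tar.gz", verbose=True)
    handler.set_client(docker.from_env())
    with handler:  # starts the container; on exit it is killed and the extracted repo removed
        handler.compile_repo()
        handler.test_repo()
        handler.generate_coverage_report()
        for report, coverage in handler.check_coverage("src/main/java/org/example/Foo.java"):
            print(f"{report}: {coverage:.1f}% line coverage")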