from dataclasses import dataclass
from enum import Enum
import os, json, tarfile, argparse
from typing import Optional
from dataset import Dataset, ArchiveState
from utils import EnumChoicesAction


class OutputType(Enum):
    """Kinds of reference output this script can generate.

    The member values are the strings accepted on the command line and are
    also embedded in the default output file name.
    """

    # Merged file contents for entries whose follow-up diff addresses the change.
    CODE_REFINEMENT = "code_refinement"
    # Location and body of the first review comment of each selected entry.
    COMMENT_GEN = "comment_gen"
    # First review comment plus surrounding context, without its paraphrases.
    FOR_PARAPHRASES = "paraphrases"


@dataclass
class CommentGenSubmission:
    """One comment-generation record: where a review comment sits and what it says."""

    # File the comment is attached to.
    path: str
    # First line the comment refers to.
    line_from: int
    # Last line the comment refers to; may be None.
    line_to: Optional[int]
    # Text of the comment.
    body: str


def extract_comment_for_paraphrases(dataset_path: str, output_path: str):
    """Export the first comment of each selected entry together with its context.

    Only entries whose selection is set and has ``comment_suggests_change``
    truthy are exported. For each of them the output JSON maps the entry id to:

    * ``link``: URL of the pull request on GitHub,
    * ``comment``: the attributes of the entry's first comment, without
      ``paraphrases``,
    * ``files``: file name -> ``content_before_pr``,
    * ``diffs_before``: the entry's ``diffs_before`` value.

    Args:
        dataset_path: Path of the dataset JSON file to load.
        output_path: Path of the JSON file to write.
    """
    dataset = Dataset.from_json(dataset_path)
    results: dict[str, dict] = {}

    for entry in dataset.entries:
        sel = entry.metadata.selection
        if not sel or not sel.comment_suggests_change:
            continue

        # Build a filtered *copy* of the comment's attributes. Deleting the key
        # from `__dict__` directly would strip the attribute from the dataset
        # object itself, and would raise KeyError if it were absent.
        comment = {
            key: value
            for key, value in vars(entry.comments[0]).items()
            if key != "paraphrases"
        }
        results[entry.metadata.id] = {
            "link": f"https://github.com/{entry.metadata.repo}/pull/{entry.metadata.pr_number}",
            "comment": comment,
            "files": {fname: fdata.content_before_pr for fname, fdata in entry.files.items()},
            "diffs_before": entry.diffs_before,
        }

    # Write out the comments-with-context JSON; non-JSON-native objects are
    # serialized through their attribute dict.
    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(results, out_file, default=lambda o: o.__dict__, indent=4)

    print(f"Saved {len(results)} entries to {output_path}")


def extract_comment_predictions(dataset_path: str, output_path: str):
    """Write the reference comment-generation predictions for a dataset.

    Every entry whose selection is set and has ``comment_suggests_change``
    truthy contributes one ``CommentGenSubmission`` built from its first
    comment. The result is dumped as JSON keyed by entry id.

    Args:
        dataset_path: Path of the dataset JSON file to load.
        output_path: Path of the JSON file to write.
    """

    def _as_plain(obj):
        # Fallback serializer: dump any non-JSON-native object via its attributes.
        return obj.__dict__

    loaded = Dataset.from_json(dataset_path)
    submissions: dict[str, CommentGenSubmission] = {}

    for item in loaded.entries:
        selection = item.metadata.selection
        if not selection or not selection.comment_suggests_change:
            continue

        first_comment = item.comments[0]
        submissions[item.metadata.id] = CommentGenSubmission(
            path=first_comment.file,
            line_from=first_comment.from_,
            line_to=first_comment.to,
            body=first_comment.body,
        )

    # Persist the collected submissions as the reference predictions file.
    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(submissions, handle, default=_as_plain, indent=4)

    print(f"Saved {len(submissions)} entries to {output_path}")


def extract_refinement_predictions(dataset_path: str, archives_path: str, output_path: str):
    """Write the reference code-refinement predictions for a dataset.

    For every entry whose selection is set and has both
    ``diff_after_address_change`` and ``is_code_related`` truthy, the entry's
    merged archive is opened and the content of each file named in
    ``entry.diffs_after`` is read from it. The output JSON maps the entry id to
    a ``{filename: content}`` dict.

    Entries whose archive is missing are reported and skipped. Files that
    cannot be found or extracted are reported and left out of that entry's
    dict.

    Args:
        dataset_path: Path of the dataset JSON file to load.
        archives_path: Directory containing the ``.tar.gz`` archives.
        output_path: Path of the JSON file to write.
    """
    dataset = Dataset.from_json(dataset_path)
    results = {}

    for entry in dataset.entries:
        sel = entry.metadata.selection
        if not sel or not (sel.diff_after_address_change and sel.is_code_related):
            continue
        entry_id = entry.metadata.id

        # Determine the merged archive filename
        archive_filename = entry.metadata.archive_name(ArchiveState.MERGED)
        archive_path = os.path.join(archives_path, archive_filename)
        if not os.path.exists(archive_path):
            print(f"Archive not found: {archive_path}")
            continue

        # Extract file contents after merge
        file_contents = {}
        with tarfile.open(archive_path, "r:gz") as tar:
            # List the members once instead of once per requested file.
            members = tar.getmembers()
            for filename in entry.diffs_after:
                # Match on a path-component boundary: a bare `endswith(filename)`
                # would let "a.py" match "data.py" and return the wrong file.
                boundary_suffix = "/" + filename
                member = next(
                    (
                        m
                        for m in members
                        if m.name == filename or m.name.endswith(boundary_suffix)
                    ),
                    None,
                )
                if member is None:
                    print(f"File {filename} not found in {archive_path}")
                    continue
                extracted = tar.extractfile(member)
                if extracted is None:
                    print(f"Could not extract {filename} from {archive_path}")
                    continue
                # Close the extracted stream as soon as it has been read.
                with extracted:
                    raw = extracted.read()
                # Undecodable bytes are replaced rather than aborting the export.
                file_contents[filename] = raw.decode("utf-8", errors="replace")

        results[entry_id] = file_contents

    # Write out the exact predictions reference JSON
    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(results, out_file, indent=4)

    print(f"Saved {len(results)} entries to {output_path}")


if __name__ == "__main__":
    # Command-line entry point: parse the arguments, pick a default output
    # file name when none is given, and dispatch on the requested output type.
    parser = argparse.ArgumentParser(
        description="Extract merged archive contents for entries addressing a change"
    )
    parser.add_argument(
        'dataset',
        help='Path to the dataset JSON file',
    )
    parser.add_argument(
        '-o',
        '--output',
        help='Path to the output JSON file. Default is `exact_predictions_{output-type}.json`',
    )
    parser.add_argument(
        '-a',
        '--archives',
        help='Directory where archive files are located. Required if output type is code_refinement',
    )
    parser.add_argument(
        "-t",
        "--output-type",
        type=OutputType,
        default=OutputType.COMMENT_GEN,
        action=EnumChoicesAction,
        help="Type of output to generate",
    )
    args = parser.parse_args()

    output_type = OutputType(args.output_type)

    # Validate with parser.error rather than `assert`: assertions are stripped
    # under `python -O`, and a usage message is clearer than a traceback.
    if output_type is OutputType.CODE_REFINEMENT and args.archives is None:
        parser.error("--archives is required when --output-type is code_refinement")

    if args.output is None:
        args.output = f"exact_predictions_{output_type.value}.json"

    if output_type is OutputType.COMMENT_GEN:
        extract_comment_predictions(args.dataset, args.output)
    elif output_type is OutputType.CODE_REFINEMENT:
        extract_refinement_predictions(args.dataset, args.archives, args.output)
    elif output_type is OutputType.FOR_PARAPHRASES:
        extract_comment_for_paraphrases(args.dataset, args.output)