"""Extract "exact prediction" reference JSON files from a code-review dataset.

Depending on the selected output type this script produces:

- ``comment_gen``: ground-truth review comments as ``CommentGenSubmission``
  records,
- ``code_refinement``: post-merge file contents pulled from per-entry
  merged archives,
- ``paraphrases``: comment plus file context used for paraphrase collection.
"""

import argparse
import json
import os
import tarfile
from dataclasses import dataclass
from enum import Enum
from typing import Optional

from tqdm import tqdm

from dataset import ArchiveState, Dataset
from utils import EnumChoicesAction


class OutputType(Enum):
    """Kinds of reference JSON this script can emit."""

    CODE_REFINEMENT = "code_refinement"
    COMMENT_GEN = "comment_gen"
    FOR_PARAPHRASES = "paraphrases"


@dataclass
class CommentGenSubmission:
    """Ground-truth comment location and body for the comment-gen task."""

    path: str  # file the comment is attached to
    from_: int  # first line of the commented range
    to: Optional[int]  # last line of the range, or None
    body: str  # review comment text


def _dump_results(results: dict, output_path: str) -> None:
    """Serialize *results* to *output_path* as JSON and report the entry count.

    ``default=lambda o: o.__dict__`` lets dataclass instances (e.g.
    ``CommentGenSubmission``) serialize without a custom encoder; it is a
    no-op for plain dict/str payloads.
    """
    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(results, out_file, default=lambda o: o.__dict__, indent=4)
    print(f"Saved {len(results)} entries to {output_path}")


def extract_comment_for_paraphrases(dataset_path: str, output_path: str):
    """Extract comment + file context for entries whose comment suggests a change.

    Writes a JSON mapping of entry id -> {link, comment, files, diffs_before}.
    """
    dataset = Dataset.from_json(dataset_path)
    results: dict[str, dict] = {}
    for entry in dataset.entries:
        sel = entry.metadata.selection
        if not (sel and sel.comment_suggests_change):
            continue
        # Copy the attribute dict before dropping "paraphrases": mutating
        # ``__dict__`` directly would delete the attribute from the live
        # dataset object, and a plain ``del`` would raise KeyError when the
        # comment has no paraphrases.
        comment = dict(entry.comments[0].__dict__)
        comment.pop("paraphrases", None)
        results[entry.metadata.id] = {
            "link": f"https://github.com/{entry.metadata.repo}/pull/{entry.metadata.pr_number}",
            "comment": comment,
            "files": {
                fname: fdata.content_before_pr for fname, fdata in entry.files.items()
            },
            "diffs_before": entry.diffs_before,
        }
    # Write out the exact predictions reference JSON
    _dump_results(results, output_path)


def extract_comment_predictions(dataset_path: str, output_path: str):
    """Extract ground-truth comment submissions for the comment-gen task.

    Writes a JSON mapping of entry id -> serialized ``CommentGenSubmission``.
    """
    dataset = Dataset.from_json(dataset_path)
    results: dict[str, CommentGenSubmission] = {}
    for entry in dataset.entries:
        sel = entry.metadata.selection
        if not (sel and sel.comment_suggests_change):
            continue
        comment = entry.comments[0]
        results[entry.metadata.id] = CommentGenSubmission(
            path=comment.file,
            from_=comment.from_,
            to=comment.to,
            body=comment.body,
        )
    # Write out the exact predictions reference JSON
    _dump_results(results, output_path)


def extract_refinement_predictions(dataset_path: str, archives_path: str, output_path: str):
    """Extract post-merge file contents for entries that address the change.

    For every qualifying entry, opens the entry's MERGED archive under
    *archives_path* and pulls out the content of each file that appears in
    ``entry.diffs_after``. Missing archives/files are reported and skipped.
    """
    # Load the dataset
    dataset = Dataset.from_json(dataset_path)
    results = {}
    # Iterate over entries that address the change
    for entry in tqdm(dataset.entries, unit="entries"):
        sel = entry.metadata.selection
        if not sel or not (
            sel.diff_after_address_change
            and entry.metadata.is_code_related
            and entry.metadata.is_covered
        ):
            continue
        entry_id = entry.metadata.id
        # Determine the merged archive filename
        archive_path = os.path.join(
            archives_path, entry.metadata.archive_name(ArchiveState.MERGED)
        )
        if not os.path.exists(archive_path):
            print(f"Archive not found: {archive_path}")
            continue
        # Extract file contents after merge
        with tarfile.open(archive_path, "r:gz") as tar:
            # Hoisted out of the per-file loop: getmembers() scans the whole
            # archive, so calling it once per file was O(files x members).
            members = tar.getmembers()
            file_contents = {}
            for filename in entry.diffs_after.keys():
                # Find the member matching the file path
                member = next((m for m in members if m.name.endswith(filename)), None)
                if member is None:
                    print(f"File {filename} not found in {archive_path}")
                    continue
                extracted = tar.extractfile(member)
                if extracted is None:
                    # extractfile() returns None for non-regular members
                    # (directories, links).
                    print(f"Could not extract {filename} from {archive_path}")
                    continue
                file_contents[filename] = extracted.read().decode(
                    "utf-8", errors="replace"
                )
            results[entry_id] = file_contents
    # Write out the exact predictions reference JSON
    _dump_results(results, output_path)


def main() -> None:
    """Parse CLI arguments and dispatch to the requested extraction."""
    parser = argparse.ArgumentParser(
        description="Extract merged archive contents for entries addressing a change"
    )
    parser.add_argument(
        'dataset',
        help='Path to the dataset JSON file',
    )
    parser.add_argument(
        '-o',
        '--output',
        help='Path to the output JSON file. Default is `exact_predictions_{output-type}.json`',
    )
    parser.add_argument(
        '-a',
        '--archives',
        help='Directory where archive files are located. Required if output type is code_refinement',
    )
    parser.add_argument(
        "-t",
        "--output-type",
        type=OutputType,
        default=OutputType.COMMENT_GEN,
        action=EnumChoicesAction,
        help="Type of output to generate",
    )
    args = parser.parse_args()
    output_type = OutputType(args.output_type)
    if args.output is None:
        args.output = f"exact_predictions_{output_type.value}.json"

    if output_type is OutputType.COMMENT_GEN:
        extract_comment_predictions(args.dataset, args.output)
    elif output_type is OutputType.CODE_REFINEMENT:
        # parser.error() instead of assert: asserts are stripped under -O and
        # give an unhelpful traceback for a plain CLI-usage mistake.
        if args.archives is None:
            parser.error("--archives is required when --output-type is code_refinement")
        extract_refinement_predictions(args.dataset, args.archives, args.output)
    elif output_type is OutputType.FOR_PARAPHRASES:
        extract_comment_for_paraphrases(args.dataset, args.output)


if __name__ == "__main__":
    main()