mirror of https://github.com/karma-riuk/crab.git (synced 2025-07-04 13:18:13 +02:00)
updated the way the comment generation and code refinement inputs are exported (automated the bundling of context archives)
--- a/dataset.py
+++ b/dataset.py
@@ -1,6 +1,6 @@
 from dataclasses import dataclass, field
 from enum import Enum
-import sys, re
+import sys, re, zipfile
 from typing import Any, Dict, List, Optional, Union
 import json, argparse, os, uuid
 import pandas as pd
@@ -155,7 +155,9 @@ class Dataset:
         self,
         filename: str,
         type_: OutputType = OutputType.FULL,
+        archives_root: Optional[str] = None,
         remove_non_suggesting: bool = False,
+        verbose: bool = False,
     ) -> None:
         """Serialize the dataset to a JSON file"""
 
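
A hypothetical call showing the extended signature, with the two parameters this commit adds (the dataset variable and paths below are placeholders, not from this commit):

# Hypothetical usage of the extended to_json signature; paths are placeholders.
dataset.to_json(
    "comment_gen_dataset.json",
    OutputType.COMMENT_GEN,
    archives_root="path/to/archives",  # new: bundle context archives into the zip
    verbose=True,                      # new: report the entry count on stderr
)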
@@ -171,7 +173,9 @@ class Dataset:
             entries_to_dump = [
                 entry
                 for entry in self.entries
-                if entry.metadata.selection and entry.metadata.selection.diff_after_address_change
+                if entry.metadata.selection
+                and entry.metadata.selection.diff_after_address_change
+                and entry.metadata.is_covered
             ]
         elif type_ in {OutputType.FULL, OutputType.WEBAPP} and remove_non_suggesting:
             entries_to_dump = [
@@ -209,8 +213,43 @@ class Dataset:
         if type_ == OutputType.CODE_REFINEMENT:
             return CodeRefinementEntry.from_entry(entry).__dict__
 
-        with open(filename, "w", encoding="utf-8") as f:
-            json.dump(to_dump, f, default=transform_entry, indent=4)
+        if verbose:
+            print(f"{len(to_dump.entries)} entries...", end=" ", flush=True, file=sys.stderr)
+        json_data = json.dumps(to_dump, default=transform_entry, indent=4)
+
+        if type_ == OutputType.COMMENT_GEN or type_ == OutputType.CODE_REFINEMENT:
+            dirname = os.path.dirname(filename)
+            basename = os.path.basename(filename)
+            start, *middle, _ = basename.split('.')
+            zip_name = '.'.join(
+                [start + ('_with_context' if archives_root else '_no_context'), *middle, 'zip']
+            )
+            zip_path = os.path.join(dirname, zip_name)
+
+            with zipfile.ZipFile(zip_path, 'w') as zf:
+                zf.writestr(type_.value + "_input.json", json_data)
+
+                if archives_root:
+                    for entry in to_dump.entries:
+                        archive_src_name = entry.metadata.archive_name(ArchiveState.BASE)
+                        archive_path = os.path.join(archives_root, archive_src_name)
+                        if not os.path.exists(archive_path):
+                            print(
+                                f"[ERROR] The archive {archive_src_name} ({entry.metadata.repo} #{entry.metadata.pr_number}) is not present in {archives_root}. Couldn't add it to the dataset",
+                                file=sys.stderr,
+                            )
+                            continue
+                        archive_dest_name = entry.metadata.archive_name(
+                            ArchiveState.BASE, only_id=True
+                        ).replace("_base", "")
+                        with open(archive_path, 'rb') as archive_content:
+                            zf.writestr(
+                                os.path.join("context", archive_dest_name),
+                                archive_content.read(),
+                            )
+        else:
+            with open(filename, "w", encoding="utf-8") as f:
+                f.write(json_data)
 
     @staticmethod
     def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset":
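
A minimal standalone sketch of the zip-naming rule from the hunk above, with hypothetical filenames (the logic mirrors the diff verbatim):

# Sketch of the zip-naming rule introduced above; example filenames are hypothetical.
import os
from typing import Optional

def zip_path_for(filename: str, archives_root: Optional[str]) -> str:
    dirname = os.path.dirname(filename)
    basename = os.path.basename(filename)
    start, *middle, _ = basename.split('.')  # drops the original extension
    zip_name = '.'.join(
        [start + ('_with_context' if archives_root else '_no_context'), *middle, 'zip']
    )
    return os.path.join(dirname, zip_name)

print(zip_path_for("output.json", None))                # output_no_context.zip
print(zip_path_for("output.json", "path/to/archives"))  # output_with_context.zip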
@@ -298,7 +337,7 @@ if __name__ == "__main__":
         "--output",
         type=str,
         default="output.json",
-        help="Path to the output JSON file",
+        help="Path to the output JSON file. Default 'output.json'",
     )
     parser.add_argument(
         "-p",
@@ -312,7 +351,13 @@ if __name__ == "__main__":
         type=OutputType,
         default=OutputType.FULL,
         action=EnumChoicesAction,
-        help="Type of output to generate. webapp is just to keep what's necessary for the webapp to run, i.e. the metadata and the comments.",
+        help=f"Type of output to generate. Note that for the {OutputType.COMMENT_GEN.value} or {OutputType.CODE_REFINEMENT.value} types, the resulting file will be a compressed archive with the data, and '.zip' will replace the output extension. {OutputType.WEBAPP.value} is just to keep what's necessary for the webapp to run, i.e. the metadata and the comments.",
+    )
+    parser.add_argument(
+        "-a",
+        "--archives",
+        type=str,
+        help=f"Path to the root directory where the archives are present. Relevant only for {OutputType.COMMENT_GEN.value} or {OutputType.CODE_REFINEMENT.value}. If given, the relevant archives are added to the resulting zipped dataset and the string '_with_context' is appended to the filename, before the extension. If not given, the string '_no_context' is appended to the filename.",
     )
     parser.add_argument(
         "--remove-non-suggesting",
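
With the new flag, a hypothetical invocation could look like `python dataset.py --output dataset.json -p comment_gen -a path/to/archives`. The long form of -p is presumably --output-type, given the args.output_type reference below, and comment_gen stands in for OutputType.COMMENT_GEN.value, which this diff does not show.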
@@ -338,5 +383,11 @@ if __name__ == "__main__":
         print("Exiting without saving.")
         exit(0)
     print(f"Saving dataset to {args.output},", end=" ", flush=True)
-    dataset.to_json(args.output, args.output_type, args.remove_non_suggesting)
+    dataset.to_json(
+        args.output,
+        args.output_type,
+        args.archives,
+        args.remove_non_suggesting,
+        verbose=True,
+    )
     print("Done")
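
For consumers of the new export format, a small sketch of reading the resulting archive; it assumes OutputType.COMMENT_GEN.value is "comment_gen" (not shown in this diff) and that the export was run with --archives, hence the "_with_context" name:

# Sketch: inspecting an archive produced by the new export path.
# Assumes the enum value "comment_gen" and a "_with_context" export.
import json, zipfile

with zipfile.ZipFile("output_with_context.zip") as zf:
    data = json.loads(zf.read("comment_gen_input.json"))  # the serialized dataset
    context = [n for n in zf.namelist() if n.startswith("context/")]
    print(f"{len(context)} context archives bundled alongside the dataset")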