Mirror of https://github.com/karma-riuk/crab.git, synced 2025-07-04 13:18:13 +02:00
implemented new way to extract stats from dataset
@@ -1,60 +1,253 @@
-import os
-from datetime import datetime
-
-import pandas as pd
-from tqdm import tqdm
-from github import Github
-from utils import has_only_1_round_of_comments, has_only_1_comment, move_logger_to_file
-
-tqdm.pandas()
-
-# Initialize GitHub API client
-g = Github(os.environ["GITHUB_AUTH_TOKEN_CRAB"])
+import argparse, re, statistics
+from collections import defaultdict
+
+from dataset import Dataset
+from utils import EnumChoicesAction
+from enum import Enum
-
-
-def process_pull(repo, pull):
-    commits = pull.get_commits()
-    comments = pull.get_review_comments()
-
-    return {
-        "repo": repo.full_name,
-        "pr_number": pull.number,
-        "additions": pull.additions,
-        "deletions": pull.deletions,
-        "changed_files": pull.changed_files,
-        "has_only_1_round_of_comments": has_only_1_round_of_comments(commits, comments),
-        "has_only_1_comment": has_only_1_comment(commits, comments),
-    }
+
+
+def distrib_of_prs_per_repo(dataset: Dataset):
+    repo2pr = defaultdict(int)
+    for entry in dataset.entries:
+        repo2pr[entry.metadata.repo] += 1
+    for repo, num_pr in repo2pr.items():
+        print(f"{repo} {num_pr}")
-
-
-def process_repo(repo_name):
-    repo = g.get_repo(repo_name)
-    stats = []
-
-    with tqdm(list(repo.get_pulls(state="closed")), desc=repo_name, leave=False) as pbar:
-        for pull in pbar:
-            pbar.set_postfix({"started at": datetime.now().strftime("%d/%m, %H:%M:%S")})
-            if not pull.merged_at:
-                continue
-
-            stats.append(process_pull(repo, pull))
-    return stats
+
+
+def count_entries(dataset: Dataset):
+    print(f"Total entries in dataset: {len(dataset.entries)}")
-
-
-def main():
-    repos = pd.read_csv("results.csv")
-    repos = repos[(repos["good_repo_for_crab"] == True) & (repos["n_tests"] > 0)]
-    stats = []
+
+
+def distrib_of_prs_per_repo_covered(dataset: Dataset):
+    repo2pr: dict[str, int] = defaultdict(int)
+    for entry in dataset.entries:
+        if entry.metadata.is_covered:
+            repo2pr[entry.metadata.repo] += 1
+    for repo, num_pr in repo2pr.items():
+        print(f"{repo} {num_pr}")
-
-    try:
-        for _, row in tqdm(repos.iterrows(), total=len(repos)):
-            if "name" not in row or not isinstance(row["name"], str):
-                continue
-            stats.extend(process_repo(row["name"]))
-        pd.DataFrame(stats).to_csv("pr_stats.csv", index=False)
-    finally:
-        pd.DataFrame(stats).to_csv("pr_stats.csv", index=False)
+
+
+def biggest_repo_comment_gen(dataset: Dataset):
+    N = 5
+    repo2pr: dict[str, int] = defaultdict(int)
+    for entry in dataset.entries:
+        repo2pr[entry.metadata.repo] += 1
+
+    total = sum(repo2pr.values())
+    top_n = sorted(repo2pr, key=lambda e: repo2pr.get(e, 0), reverse=True)[:N]
+
+    print("Repos with the largest number of PRs for comment gen:")
+    print('\n'.join([f"{repo}: {repo2pr[repo]} ({repo2pr[repo]/total:.2%})" for repo in top_n]))
+
+
+def biggest_repo_refinement(dataset: Dataset):
+    N = 5
+    repo2pr: dict[str, int] = defaultdict(int)
+    for entry in dataset.entries:
+        if entry.metadata.is_covered:
+            repo2pr[entry.metadata.repo] += 1
+
+    total = sum(repo2pr.values())
+    top_n = sorted(repo2pr, key=lambda e: repo2pr.get(e, 0), reverse=True)[:N]
+
+    print("Repos with the largest number of PRs for refinement:")
+    print('\n'.join([f"{repo}: {repo2pr[repo]} ({repo2pr[repo]/total:.2%})" for repo in top_n]))
+
+
+def count_tokens(comment: str):
+    return len(re.findall(r'\w+', comment))
+
+
+def tokens_per_comment(dataset: Dataset):
+    all_counts = [count_tokens(entry.comments[0].body) for entry in dataset.entries]
+    print('\n'.join([str(i) for i in all_counts]))
+    return
+    ntoken2count: dict[int, int] = defaultdict(int)
+    for entry in dataset.entries:
+        ntoken2count[count_tokens(entry.comments[0].body)] += 1
+
+    for k, v in ntoken2count.items():
+        print(f"{k} {v}")
+
+
+def tokens_quartiles(dataset: Dataset):
+    all_counts = [count_tokens(entry.comments[0].body) for entry in dataset.entries]
+    q1, q2, q3 = statistics.quantiles(all_counts)
+    print(f"Min {min(all_counts)}")
+    print(f"Q1 = {q1}, Median = {q2}, Q3 = {q3}")
+    print(f"Max {max(all_counts)}")
+
+
+def diff_before_sizes(dataset: Dataset):
+    all_counts = [
+        sum(len(diff.splitlines()) if diff else 0 for diff in entry.diffs_before.values())
+        for entry in dataset.entries
+        if entry.metadata.is_covered
+    ]
+    print('\n'.join([str(i) for i in all_counts]))
+    return
+    diffsize2count: dict[int, int] = defaultdict(int)
+    for entry in dataset.entries:
+        diff_size = sum(
+            len(diff.splitlines()) if diff else 0 for diff in entry.diffs_before.values()
+        )
+        diffsize2count[diff_size] += 1
+
+    for k, v in diffsize2count.items():
+        print(f"{k} {v}")
+
+
+def diff_before_quartiles(dataset: Dataset):
+    all_counts = [
+        sum(len(diff.splitlines()) if diff else 0 for diff in entry.diffs_before.values())
+        for entry in dataset.entries
+    ]
+    q1, q2, q3 = statistics.quantiles(all_counts)
+    print(f"Min {min(all_counts)}")
+    print(f"Q1 = {q1}, Median = {q2}, Q3 = {q3}")
+    print(f"Max {max(all_counts)}")
+
+
+def n_files_before(dataset: Dataset):
+    all_counts = [
+        sum(1 if diff else 0 for diff in entry.diffs_before.values())
+        for entry in dataset.entries
+        if entry.metadata.is_covered
+    ]
+    print('\n'.join([str(i) for i in all_counts]))
+    return
+    nfiles2count: dict[int, int] = defaultdict(int)
+    for entry in dataset.entries:
+        n_files = sum(1 if diff else 0 for diff in entry.diffs_before.values())
+        nfiles2count[n_files] += 1
+
+    for k, v in nfiles2count.items():
+        print(f"{k} {v}")
+
+
+def diff_after_sizes(dataset: Dataset):
+    all_counts = [
+        sum(len(diff.splitlines()) if diff else 0 for diff in entry.diffs_after.values())
+        for entry in dataset.entries
+        if entry.metadata.is_covered
+    ]
+    print('\n'.join([str(i) for i in all_counts]))
+    return
+    diffsize2count: dict[int, int] = defaultdict(int)
+    for entry in dataset.entries:
+        if entry.metadata.is_covered:
+            diff_size = sum(
+                len(diff.splitlines()) if diff else 0 for diff in entry.diffs_after.values()
+            )
+            diffsize2count[diff_size] += 1
+
+    for k, v in diffsize2count.items():
+        print(f"{k} {v}")
+
+
+def n_files_after(dataset: Dataset):
+    all_counts = [
+        sum(1 if diff else 0 for diff in entry.diffs_after.values())
+        for entry in dataset.entries
+        if entry.metadata.is_covered
+    ]
+    print('\n'.join([str(i) for i in all_counts]))
+    return
+    nfiles2count: dict[int, int] = defaultdict(int)
+    for entry in dataset.entries:
+        if entry.metadata.is_covered:
+            n_files = sum(1 if diff else 0 for diff in entry.diffs_after.values())
+            nfiles2count[n_files] += 1
+
+    for k, v in nfiles2count.items():
+        print(f"{k} {v}")
+
+
+def diff_after_sizes_selected(dataset: Dataset):
+    all_counts = [
+        sum(len(diff.splitlines()) if diff else 0 for diff in entry.diffs_after.values())
+        for entry in dataset.entries
+        if entry.metadata.is_covered
+        if entry.metadata.selection and entry.metadata.selection.diff_after_address_change
+    ]
+    print('\n'.join([str(i) for i in all_counts]))
+    return
+    diffsize2count: dict[int, int] = defaultdict(int)
+    for entry in dataset.entries:
+        if entry.metadata.is_covered:
+            if entry.metadata.selection and entry.metadata.selection.diff_after_address_change:
+                diff_size = sum(
+                    len(diff.splitlines()) if diff else 0 for diff in entry.diffs_after.values()
+                )
+                diffsize2count[diff_size] += 1
+
+    for k, v in diffsize2count.items():
+        print(f"{k} {v}")
+
+
+def n_files_after_selected(dataset: Dataset):
+    all_counts = [
+        sum(1 if diff else 0 for diff in entry.diffs_after.values())
+        for entry in dataset.entries
+        if entry.metadata.is_covered
+        if entry.metadata.selection and entry.metadata.selection.diff_after_address_change
+    ]
+    print('\n'.join([str(i) for i in all_counts]))
+    return
+
+    nfiles2count: dict[int, int] = defaultdict(int)
+    for entry in dataset.entries:
+        if entry.metadata.is_covered:
+            if entry.metadata.selection and entry.metadata.selection.diff_after_address_change:
+                n_files = sum(1 if diff else 0 for diff in entry.diffs_after.values())
+                nfiles2count[n_files] += 1
+
+    for k, v in nfiles2count.items():
+        print(f"{k} {v}")
+
+
+class Action(Enum):
+    COUNT = ("count", count_entries)
+    DISTRIB = ("distrib", distrib_of_prs_per_repo)
+    DISTRIB_COVERED = ("distrib_covered", distrib_of_prs_per_repo_covered)
+    BIGGEST_REPO_COMMENT_GEN = ("biggest_repo_comment_gen", biggest_repo_comment_gen)
+    BIGGEST_REPO_REFINEMENT = ("biggest_repo_refinement", biggest_repo_refinement)
+    TOKENS = ("tokens", tokens_per_comment)
+    TOKENS_QUARTILES = ("tokens_quartiles", tokens_quartiles)
+    DIFF_BEFORE_SIZES = ("diff_before_sizes", diff_before_sizes)
+    DIFF_BEFORE_QUARTILES = ("diff_before_quartiles", diff_before_quartiles)
+    N_FILES_BEFORE = ("n_files_before", n_files_before)
+    DIFF_AFTER_SIZES = ("diff_after_sizes", diff_after_sizes)
+    N_FILES_AFTER = ("n_files_after", n_files_after)
+    DIFF_AFTER_SIZES_SELECTED = ("diff_after_sizes_selected", diff_after_sizes_selected)
+    N_FILES_AFTER_SELECTED = ("n_files_after_selected", n_files_after_selected)
+
+    def __new__(cls, value, func):
+        # This __new__ assigns the "value" for each member (for argparse/choices),
+        # and also stashes the function pointer into a .func attribute.
+        obj = object.__new__(cls)
+        obj._value_ = value
+        obj.func = func  # type: ignore
+        return obj
+
+    def perform(self, dataset):
+        # Simply call the stored function, passing the dataset.
+        return self.func(dataset)  # type: ignore
+
+
 if __name__ == "__main__":
-    move_logger_to_file("github", "github_api.log")
-    main()
+    parser = argparse.ArgumentParser(description='Creates the triplets for the CRAB dataset.')
+    parser.add_argument(
+        'dataset',
+        type=str,
+        help='The dataset to extract data from',
+    )
+    parser.add_argument(
+        'action',
+        type=Action,
+        action=EnumChoicesAction,
+        help='Action to perform on the data',
+    )
+
+    args = parser.parse_args()
+
+    args.action.perform(Dataset.from_json(args.dataset))
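
For orientation, here is a hypothetical invocation of the new stats script (the file name stats.py is an assumption; the choice strings come from the Action enum values above, assuming EnumChoicesAction exposes them as the valid choices):

    python stats.py dataset.json count                     # total number of entries
    python stats.py dataset.json tokens_quartiles          # min, quartiles and max of comment token counts
    python stats.py dataset.json biggest_repo_comment_gen  # top-5 repos by number of PRs

The positional arguments mirror the parser: first the path handed to Dataset.from_json, then the action to perform on it.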
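The Action class relies on a value-carrying Enum so that each command-line choice maps directly to a stats function. A minimal, self-contained sketch of that pattern (the Stat enum and its members are illustrative, not from the repo):

    from enum import Enum

    class Stat(Enum):
        # each member carries (argparse-facing value, callable to dispatch to)
        COUNT = ("count", len)
        TOTAL = ("total", sum)

        def __new__(cls, value, func):
            obj = object.__new__(cls)
            obj._value_ = value  # what Stat("count") and the CLI choices see
            obj.func = func      # the callable stored on the member
            return obj

        def perform(self, data):
            return self.func(data)

    print(Stat("count").perform([3, 1, 4]))  # 3: lookup by value, then dispatch
    print(Stat.TOTAL.perform([3, 1, 4]))     # 8

Because the tuple assigned to each member is unpacked into __new__, the string becomes the member's value (which is what makes type=Action work in argparse) while the function rides along as a plain attribute.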
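And a quick illustration of the token counting behind tokens_per_comment and tokens_quartiles, using made-up review comments:

    import re
    import statistics

    def count_tokens(comment: str):
        # same \w+ based word count as in the script
        return len(re.findall(r'\w+', comment))

    comments = ["Please rename this variable.", "LGTM", "Consider extracting a helper method here."]
    counts = [count_tokens(c) for c in comments]  # [4, 1, 6]
    # statistics.quantiles defaults to n=4, returning the three cut points Q1, median, Q3
    q1, q2, q3 = statistics.quantiles(counts)
    print(min(counts), q1, q2, q3, max(counts))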