mirror of https://github.com/karma-riuk/crab.git
extracting the file before, the comment, and the file after for the comments in a PR
@@ -2,6 +2,10 @@ import os, requests, re
 from datetime import datetime
 from typing import Optional
 import itertools
+import pandas as pd
 
+from unidiff import PatchSet
+from io import StringIO
+
 COMMON_HEADERS = {
     'Accept': 'application/vnd.github+json',
@@ -10,7 +14,10 @@ COMMON_HEADERS = {
 }
 
 def github_call(url):
-    return requests.get(url, headers=COMMON_HEADERS)
+    result = requests.get(url, headers=COMMON_HEADERS)
+    if result.status_code != 200:
+        raise Exception(f"Failed to fetch {url}: {result.status_code}")
+    return result
 
 def get_comments(repo_url: str, pr_number: str) -> list[dict]:
     response = github_call(f'{repo_url}/pulls/{pr_number}/comments')
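
With this change, a non-200 status raises instead of flowing through silently. A minimal sketch of how a caller sees the stricter github_call (the URL below is a hypothetical example, not taken from this commit):

    # Hypothetical usage: failed requests now raise rather than return.
    try:
        response = github_call("https://api.github.com/repos/octocat/Hello-World/pulls/1/comments")
        comments = response.json()
    except Exception as e:
        # Rate limits, missing PRs, and bad URLs all surface here.
        print(f"GitHub API error: {e}")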
@@ -99,7 +106,54 @@ def get_overlapping_commits_and_comments(commits: list[dict], comments: list[dict]
                 continue
             if file["filename"] == comment["path"]:
                 if is_range_overlapping(file["patch_range"]["old_range"], comment["hunk_range"]["new_range"]):
-                    ret.append((commit, comment))
+                    commit_copy = commit.copy()
+                    commit_copy["relevant_file"] = file
+                    ret.append((commit_copy, comment))
+    return ret
+
+def reverse_patch(file_after: str, patch_content: str) -> str:
+    """
+    Reverses a patch and applies it to a file to get the version of the file before the patch.
+    """
+    # Parse the patch
+    patch = PatchSet(StringIO(patch_content))
+
+    # Extract the file to be patched
+    after_lines = file_after.splitlines(keepends=True)
+
+    for patched_file in patch:
+        if patched_file.is_modified_file:
+            original_lines = after_lines[:]
+            modified_lines = []
+
+            # Apply the patch in reverse
+            for hunk in patched_file:
+                hunk_lines = [str(line.value) for line in hunk.source_lines()]
+                new_start = hunk.target_start - 1
+                new_end = new_start + hunk.target_length
+
+                # Replace modified section with original content from patch
+                modified_lines.extend(original_lines[:new_start])
+                modified_lines.extend(hunk_lines)
+                original_lines = original_lines[new_end:]
+
+            modified_lines.extend(original_lines)
+            return "".join(modified_lines)
+
+    return file_after  # Return unmodified if no patch applies
+
+def extract_triplet(commit_comments: list[tuple[dict, dict]]) -> list[dict]:
+    ret = []
+    for commit, comment in commit_comments:
+        file_after = github_call(commit["relevant_file"]["raw_url"]).text
+        filename = comment["path"]
+        patch_content = f"--- a/{filename}\n+++ b/{filename}\n" + commit["relevant_file"]["patch"] + "\n"
+        file_before = reverse_patch(file_after, patch_content)
+        ret.append({
+            "file_before": file_before,
+            "comment": comment["body"],
+            "file_after": file_after
+        })
     return ret
 
 def process_pull_request(repo_url: str, pr_number: str) -> bool:
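
To illustrate what reverse_patch is meant to compute, here is a small hand-made round trip; the file contents and patch are invented for the example (note that the slicing arithmetic above appears to assume one hunk per file, which holds for this input):

    # A hypothetical one-hunk patch: "line two" became "line two changed".
    file_after = "line one\nline two changed\nline three\n"
    patch_content = (
        "--- a/example.txt\n"
        "+++ b/example.txt\n"
        "@@ -1,3 +1,3 @@\n"
        " line one\n"
        "-line two\n"
        "+line two changed\n"
        " line three\n"
    )
    # Expected output: the pre-patch file "line one\nline two\nline three\n".
    print(reverse_patch(file_after, patch_content))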
@@ -124,12 +178,20 @@ def process_pull_request(repo_url: str, pr_number: str) -> bool:
 
 
     overlapping_commits_and_comments = get_overlapping_commits_and_comments(commits, comments)
 
     for commit, comment in overlapping_commits_and_comments:
         print(f"Commit: {commit['sha']} address comment {comment['id']}")
         print(f"Commit message: {commit['commit']['message']}")
         print(f"Comment: {comment['body']}")
+        print(commit["relevant_file"]['patch'])
         print()
+
+    triplets_df = pd.DataFrame(extract_triplet(overlapping_commits_and_comments))
+    repo_name = "/".join(repo_url.split("/")[-2:])
+    triplets_df["repo"] = repo_name
+    triplets_df["pr_number"] = pr_number
+    triplets_df.to_csv("triplets.csv", index=False)
+
     return True
 
 if __name__ == "__main__":
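
After process_pull_request runs, each (file_before, comment, file_after) triplet lands as one row of triplets.csv, tagged with its repo and PR number. A minimal sketch of inspecting the output with pandas (column names taken from the code above):

    import pandas as pd

    triplets = pd.read_csv("triplets.csv")
    # Columns written above: file_before, comment, file_after, repo, pr_number.
    print(triplets[["repo", "pr_number", "comment"]].head())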
@@ -2,3 +2,4 @@ pandas
 tqdm
 docker
 beautifulsoup4
+unidiff