From 3a4bfd611bfd00f226af91175db92a04a053bfaf Mon Sep 17 00:00:00 2001 From: Karma Riuk Date: Tue, 13 May 2025 13:27:38 +0200 Subject: [PATCH] ported backend to python --- .gitignore | 256 +++++++++++++++----------- jest.config.js | 8 - package.json | 23 --- public/index.html | 2 +- public/js/index.js | 2 +- src/routes/__tests__/datasets.test.js | 92 --------- src/routes/answers.js | 129 ------------- src/routes/answers.py | 55 ++++++ src/routes/datasets.js | 33 ---- src/routes/datasets.py | 17 ++ src/routes/index.js | 23 --- src/routes/index.py | 16 ++ src/server.js | 25 --- src/server.py | 35 ++++ src/socket.js | 14 -- src/utils/bleu.js | 125 ------------- src/utils/errors.js | 6 - src/utils/errors.py | 4 + src/utils/paths.js | 11 -- src/utils/paths.py | 8 + src/utils/process_data.js | 58 ------ src/utils/process_data.py | 46 +++++ 22 files changed, 330 insertions(+), 658 deletions(-) delete mode 100644 jest.config.js delete mode 100644 package.json delete mode 100644 src/routes/__tests__/datasets.test.js delete mode 100644 src/routes/answers.js create mode 100644 src/routes/answers.py delete mode 100644 src/routes/datasets.js create mode 100644 src/routes/datasets.py delete mode 100644 src/routes/index.js create mode 100644 src/routes/index.py delete mode 100644 src/server.js create mode 100644 src/server.py delete mode 100644 src/socket.js delete mode 100644 src/utils/bleu.js delete mode 100644 src/utils/errors.js create mode 100644 src/utils/errors.py delete mode 100644 src/utils/paths.js create mode 100644 src/utils/paths.py delete mode 100644 src/utils/process_data.js create mode 100644 src/utils/process_data.py diff --git a/.gitignore b/.gitignore index b9ed5c4..0a19790 100644 --- a/.gitignore +++ b/.gitignore @@ -1,136 +1,174 @@ -# Logs -logs +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: *.log -npm-debug.log* -yarn-debug.log* -yarn-error.log* -lerna-debug.log* -.pnpm-debug.log* +local_settings.py +db.sqlite3 +db.sqlite3-journal -# Diagnostic reports (https://nodejs.org/api/report.html) -report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json +# Flask stuff: +instance/ +.webassets-cache -# Runtime data -pids -*.pid -*.seed -*.pid.lock +# Scrapy stuff: +.scrapy -# Directory for instrumented libs generated by jscoverage/JSCover -lib-cov +# Sphinx documentation +docs/_build/ -# Coverage directory used by tools like istanbul -coverage -*.lcov +# PyBuilder +.pybuilder/ +target/ -# nyc test coverage -.nyc_output +# Jupyter Notebook +.ipynb_checkpoints -# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) -.grunt +# IPython +profile_default/ +ipython_config.py -# Bower dependency directory (https://bower.io/) -bower_components +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version -# node-waf configuration -.lock-wscript +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock -# Compiled binary addons (https://nodejs.org/api/addons.html) -build/Release +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock -# Dependency directories -node_modules/ -jspm_packages/ +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock -# Snowpack dependency directory (https://snowpack.dev/) -web_modules/ +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ -# TypeScript cache -*.tsbuildinfo +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ -# Optional npm cache directory -.npm +# Celery stuff +celerybeat-schedule +celerybeat.pid -# Optional eslint cache -.eslintcache +# SageMath parsed files +*.sage.py -# Optional stylelint cache -.stylelintcache - -# Microbundle cache -.rpt2_cache/ -.rts2_cache_cjs/ -.rts2_cache_es/ -.rts2_cache_umd/ - -# Optional REPL history -.node_repl_history - -# Output of 'npm pack' -*.tgz - -# Yarn Integrity file -.yarn-integrity - -# dotenv environment variable files +# Environments .env -.env.development.local -.env.test.local -.env.production.local -.env.local +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ -# parcel-bundler cache (https://parceljs.org/) -.cache -.parcel-cache +# Spyder project settings +.spyderproject +.spyproject -# Next.js build output -.next -out +# Rope project settings +.ropeproject -# Nuxt.js build / generate output -.nuxt -dist +# mkdocs documentation +/site -# Gatsby files -.cache/ -# Comment in the public line in if your project uses Gatsby and not Next.js -# https://nextjs.org/blog/next-9-1#public-directory-support -# public +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json -# vuepress build output -.vuepress/dist +# Pyre type checker +.pyre/ -# vuepress v2.x temp and cache directory -.temp -.cache +# pytype static type analyzer +.pytype/ -# vitepress build output -**/.vitepress/dist +# Cython debug symbols +cython_debug/ -# vitepress cache directory -**/.vitepress/cache +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ -# Docusaurus cache and generated files -.docusaurus +# Ruff stuff: +.ruff_cache/ -# Serverless directories -.serverless/ - -# FuseBox cache -.fusebox/ - -# DynamoDB Local files -.dynamodb/ - -# TernJS port file -.tern-port - -# Stores VSCode versions used for testing VSCode extensions -.vscode-test - -# yarn v2 -.yarn/cache -.yarn/unplugged -.yarn/build-state.yml -.yarn/install-state.gz -.pnp.* \ No newline at end of file +# PyPI configuration file +.pypirc diff --git a/jest.config.js b/jest.config.js deleted file mode 100644 index 630e300..0000000 --- a/jest.config.js +++ /dev/null @@ -1,8 +0,0 @@ -export default { - transform: {}, - moduleNameMapper: { - '^(\\.{1,2}/.*)\\.js$': '$1', - }, - testEnvironment: 'node', - verbose: true -}; \ No newline at end of file diff --git a/package.json b/package.json deleted file mode 100644 index 93f6a76..0000000 --- a/package.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "name": "crab-webapp", - "description": "Crab Webapp", - "type": "module", - "main": "src/server.js", - "scripts": { - "start": "node src/server.js", - "dev": "nodemon src/server.js", - "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js" - }, - "dependencies": { - "cors": "^2.8.5", - "dotenv": "^16.3.1", - "express": "^4.18.2", - "multer": "^1.4.5-lts.1", - "socket.io": "^4.8.1" - }, - "devDependencies": { - "jest": "^29.7.0", - "nodemon": "^3.0.2", - "supertest": "^6.3.4" - } -} diff --git a/public/index.html b/public/index.html index 06263fc..c2a048f 100644 --- a/public/index.html +++ b/public/index.html @@ -7,7 +7,7 @@ Dataset Downloader & Answer Uploader - + diff --git a/public/js/index.js b/public/js/index.js index 22563a4..7a56e0e 100644 --- a/public/js/index.js +++ b/public/js/index.js @@ -60,7 +60,7 @@ document.getElementById("upload-btn").onclick = async () => { idCell.textContent = id; commentCell.innerHTML = `${info["proposed_comment"]}`; - scoreCell.textContent = info["max_bleu_score"].toFixed(4); + scoreCell.textContent = info["max_bleu_score"]; }); }; diff --git a/src/routes/__tests__/datasets.test.js b/src/routes/__tests__/datasets.test.js deleted file mode 100644 index 8a75e29..0000000 --- a/src/routes/__tests__/datasets.test.js +++ /dev/null @@ -1,92 +0,0 @@ -import { jest } from '@jest/globals'; -import express from 'express'; -import request from 'supertest'; -import { join } from 'path'; -import { fileURLToPath } from 'url'; -import { dirname } from 'path'; -import datasetsRouter from '../datasets.js'; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = dirname(__filename); - -// Mock the paths utility -jest.mock('../../utils/paths.js', () => ({ - getProjectPath: (path) => join(__dirname, '../../..', path) -})); - -// Create Express app for testing -const app = express(); -app.use('/datasets', datasetsRouter); - -describe('Datasets Router', () => { - // Mock environment variables - const originalEnv = process.env; - beforeEach(() => { - jest.resetModules(); - process.env = { ...originalEnv }; - process.env.DATA_DIR = './test-data'; - }); - - afterEach(() => { - process.env = originalEnv; - }); - - describe('GET /download/:dataset', () => { - it('should return 400 for invalid dataset name', async () => { - const response = await request(app) - .get('/datasets/download/invalid_dataset') - .expect(400); - - expect(response.body).toEqual({ - error: 'Invalid dataset name' - }); - }); - - it('should download comment_generation without context', async () => { - const response = await request(app) - .get('/datasets/download/comment_generation') - .expect(200); - - expect(response.headers['content-type']).toBe('application/zip'); - expect(response.headers['content-disposition']).toContain('comment_generation_no_context.zip'); - }); - - it('should download comment_generation with context', async () => { - const response = await request(app) - .get('/datasets/download/comment_generation') - .query({ withContext: true }) - .expect(200); - - expect(response.headers['content-type']).toBe('application/zip'); - expect(response.headers['content-disposition']).toContain('comment_generation_with_context.zip'); - }); - - it('should download code_refinement without context', async () => { - const response = await request(app) - .get('/datasets/download/code_refinement') - .expect(200); - - expect(response.headers['content-type']).toBe('application/zip'); - expect(response.headers['content-disposition']).toContain('code_refinement_no_context.zip'); - }); - - it('should download code_refinement with context', async () => { - const response = await request(app) - .get('/datasets/download/code_refinement') - .query({ withContext: true }) - .expect(200); - - expect(response.headers['content-type']).toBe('application/zip'); - expect(response.headers['content-disposition']).toContain('code_refinement_with_context.zip'); - }); - - it('should handle JSON boolean for withContext parameter', async () => { - const response = await request(app) - .get('/datasets/download/comment_generation') - .query({ withContext: 'true' }) - .expect(200); - - expect(response.headers['content-disposition']).toContain('comment_generation_with_context.zip'); - }); - }); -}); \ No newline at end of file diff --git a/src/routes/answers.js b/src/routes/answers.js deleted file mode 100644 index 24e0687..0000000 --- a/src/routes/answers.js +++ /dev/null @@ -1,129 +0,0 @@ -import { Router } from "express"; -import multer from "multer"; -import { InvalidJsonFormatError } from "../utils/errors.js"; -import { evaluate_comments } from "../utils/process_data.js"; - -const router = Router(); - -// Configure multer for file uploads -const upload = multer({ - storage: multer.memoryStorage(), - limits: { - fileSize: 200 * 1024 * 1024, // 200MB limit, since the comement gen is 147MB (deflated) - }, - fileFilter: (_req, file, cb) => { - // Accept only JSON files - if (file.mimetype === "application/json") { - cb(null, true); - } else { - cb(new Error("Only JSON files are allowed")); - } - }, -}); - -// Helper function to validate JSON format -const validateJsonFormat = (data) => { - try { - const parsed = JSON.parse(data); - // Check if it's an object - if ( - typeof parsed !== "object" || - parsed === null || - Array.isArray(parsed) - ) { - throw new InvalidJsonFormatError( - "Submitted json doesn't contain an object", - ); - } - // Check if all values are strings - if ( - !Object.values(parsed).every((value) => typeof value === "string") - ) { - throw new InvalidJsonFormatError( - "Submitted json object must only be str -> str. Namely id -> comment", - ); - } - return parsed; - } catch (error) { - if (error instanceof InvalidJsonFormatError) { - throw error; - } - throw new InvalidJsonFormatError("Invalid JSON format"); - } -}; - -router.post("/submit/comments", upload.single("file"), async (req, res) => { - try { - if (!req.file) { - return res.status(400).json({ error: "No file uploaded" }); - } - - const fileContent = req.file.buffer.toString(); - let validatedData; - - try { - validatedData = validateJsonFormat(fileContent); - } catch (error) { - if (error instanceof InvalidJsonFormatError) { - return res.status(400).json({ - error: "Invalid JSON format", - message: error.message, - }); - } - throw error; - } - - const io = req.app.get("io"); - const header = req.get("X-Socket-Id"); - const socketId = header && header.trim(); - if (socketId && io.sockets.sockets.has(socketId)) { - io.to(socketId).emit("successul-upload"); - io.to(socketId).emit("started-processing"); - } - - const results = evaluate_comments(validatedData, (percent) => { - if (!(socketId && io.sockets.sockets.has(socketId))) return; - - io.to(socketId).emit("progress", { percent }); - }); - res.status(200).json(results); - } catch (error) { - console.error("Error processing submission:", error); - res.status(500).json({ error: "Error processing submission" }); - } -}); - -router.post("/submit/refinement", upload.single("file"), async (req, res) => { - try { - if (!req.file) { - return res.status(400).json({ error: "No file uploaded" }); - } - - const fileContent = req.file.buffer.toString(); - let validatedData; - - try { - validatedData = validateJsonFormat(fileContent); - } catch (error) { - if (error instanceof InvalidJsonFormatError) { - return res.status(400).json({ - error: "Invalid JSON format", - message: error.message, - }); - } - throw error; - } - - socket.emit("started-processing"); - evaluate_comments(validatedData); - res.status(200).json({ - message: "Answer submitted successfully", - data: validatedData, - }); - } catch (error) { - console.error("Error processing submission:", error); - res.status(500).json({ error: "Error processing submission" }); - } -}); - -export default router; diff --git a/src/routes/answers.py b/src/routes/answers.py new file mode 100644 index 0000000..b1962c3 --- /dev/null +++ b/src/routes/answers.py @@ -0,0 +1,55 @@ +# routes/answers.py +from flask import Blueprint, request, jsonify, current_app +from utils.errors import InvalidJsonFormatError +from utils.process_data import evaluate_comments +import json + +router = Blueprint('answers', __name__, url_prefix='/answers') + +ALLOWED_EXT = {'json'} + + +def validate_json_format(data: str) -> dict[str, str]: + try: + obj = json.loads(data) + if not isinstance(obj, dict): + raise InvalidJsonFormatError("Submitted json doesn't contain an object") + if not all(isinstance(v, str) for v in obj.values()): + raise InvalidJsonFormatError( + "Submitted json object must only be str -> str. Namely id -> comment" + ) + return obj + except InvalidJsonFormatError as e: + raise e + except Exception: + raise InvalidJsonFormatError() + + +@router.route('/submit/comments', methods=['POST']) +def submit_comments(): + file = request.files.get('file') + if file is None or file.filename is None or file.filename.split('.')[-1] not in ALLOWED_EXT: + return jsonify({'error': 'Only JSON files are allowed'}), 400 + data = file.read().decode() + try: + validated = validate_json_format(data) + except InvalidJsonFormatError as e: + return jsonify({'error': 'Invalid JSON format', 'message': str(e)}), 400 + + socketio = current_app.extensions['socketio'] + sid = request.headers.get('X-Socket-Id') + if sid: + socketio.emit('successful-upload', room=sid) + socketio.emit('started-processing', room=sid) + + results = evaluate_comments( + validated, lambda p: socketio.emit('progress', {'percent': p}, room=sid) + ) + return jsonify(results) + + +@router.route('/submit/refinement', methods=['POST']) +def submit_refinement(): + file = request.files.get('file') + # similar to above + return jsonify({'message': 'Answer submitted successfully'}) diff --git a/src/routes/datasets.js b/src/routes/datasets.js deleted file mode 100644 index e367e52..0000000 --- a/src/routes/datasets.js +++ /dev/null @@ -1,33 +0,0 @@ -import { Router } from "express"; -import { join } from "path"; -import { getProjectPath } from "../utils/paths.js"; - -const router = Router(); - -// Environment variables for paths (all relative to project root) -const DATA_DIR = getProjectPath("data"); - -const DATASETS = ["comment_generation", "code_refinement"]; - -router.get("/download/:dataset", async (req, res) => { - const { dataset } = req.params; - const withContext = req.query.withContext - ? JSON.parse(req.query.withContext) - : false; - - if (!DATASETS.includes(dataset)) { - return res.status(400).json({ error: "Invalid dataset name" }); - } - - const fileName = `${dataset}_${withContext ? "with_context" : "no_context"}.zip`; - const filePath = join(DATA_DIR, fileName); - - try { - res.download(filePath); - } catch (error) { - console.error("Error serving file:", error); - res.status(500).json({ error: "Error serving file" }); - } -}); - -export default router; diff --git a/src/routes/datasets.py b/src/routes/datasets.py new file mode 100644 index 0000000..c610f11 --- /dev/null +++ b/src/routes/datasets.py @@ -0,0 +1,17 @@ +# routes/datasets.py +from flask import Blueprint, send_from_directory, request, jsonify +from utils.paths import get_project_path + +router = Blueprint('datasets', __name__, url_prefix='/datasets') + +DATASETS = {'comment_generation', 'code_refinement'} +DATA_DIR = get_project_path('../data') + + +@router.route('/download/') +def download(dataset): + if dataset not in DATASETS: + return jsonify({'error': 'Invalid dataset name'}), 400 + with_ctx = request.args.get('withContext', 'false').lower() == 'true' + fname = f"{dataset}_{'with_context' if with_ctx else 'no_context'}.zip" + return send_from_directory(DATA_DIR, fname, as_attachment=True) diff --git a/src/routes/index.js b/src/routes/index.js deleted file mode 100644 index da876e2..0000000 --- a/src/routes/index.js +++ /dev/null @@ -1,23 +0,0 @@ -import { Router } from 'express'; -import datasetRoutes from './datasets.js'; -import answerRoutes from './answers.js'; - -const router = Router(); - -// Routes -router.get('/', (_req, res) => { - res.json({ message: 'Welcome to the Express backend!' }); -}); - -// Example route -router.get('/api/hello', (_req, res) => { - res.json({ message: 'Hello from the backend!' }); -}); - -// Dataset routes -router.use('/datasets', datasetRoutes); - -// Answer submission routes -router.use('/answers', answerRoutes); - -export default router; diff --git a/src/routes/index.py b/src/routes/index.py new file mode 100644 index 0000000..6ef63b3 --- /dev/null +++ b/src/routes/index.py @@ -0,0 +1,16 @@ +# routes/index.py +from flask import Blueprint, jsonify, current_app + + +router = Blueprint('index', __name__) + + +@router.route('/') +def welcome(): + print("hello") + return current_app.send_static_file('index.html') + + +@router.route('/api/hello') +def hello(): + return jsonify({'message': 'Hello from the backend!'}) diff --git a/src/server.js b/src/server.js deleted file mode 100644 index bcd407b..0000000 --- a/src/server.js +++ /dev/null @@ -1,25 +0,0 @@ -import express, { json } from "express"; -import cors from "cors"; -import dotenv from "dotenv"; -import routes from "./routes/index.js"; -import { createSocketServer } from "./socket.js"; - -dotenv.config(); - -const app = express(); -const port = process.env.PORT || 3000; - -// Middleware -app.use(cors()); -app.use(json()); - -// Use routes -app.use(express.static("public")); -app.use("/", routes); - -const server = createSocketServer(app); - -// Start server -server.listen(port, () => { - console.log(`Server is running on port ${port}`); -}); diff --git a/src/server.py b/src/server.py new file mode 100644 index 0000000..bd31b06 --- /dev/null +++ b/src/server.py @@ -0,0 +1,35 @@ +# server.py +from flask import Flask +from flask_cors import CORS +from flask_socketio import SocketIO +from routes.index import router as index_router +from routes.answers import router as answers_router +from routes.datasets import router as datasets_router + +import os + +app = Flask(__name__, static_folder='../public', static_url_path='/') +CORS(app) + +# Register routes +app.register_blueprint(index_router) # serves '/' and '/api/hello' +app.register_blueprint(answers_router) # mounts at '/answers' +app.register_blueprint(datasets_router) # mounts at '/datasets' + + +def init_socketio(app): + socketio = SocketIO(app, cors_allowed_origins='*') + + @socketio.on('connect') + def _(): + print('Websocket client connected') + + return socketio + + +# Init socketio +socketio = init_socketio(app) + +if __name__ == '__main__': + port = int(os.getenv('PORT', 3000)) + socketio.run(app, port=port) diff --git a/src/socket.js b/src/socket.js deleted file mode 100644 index e09c0ef..0000000 --- a/src/socket.js +++ /dev/null @@ -1,14 +0,0 @@ -import http from "http"; -import { Server } from "socket.io"; - -function onConnect(socket) { - console.log("Websocket client connected:", socket.id); -} - -export function createSocketServer(app) { - const httpServer = http.createServer(app); - const io = new Server(httpServer); - io.on("connection", onConnect); - app.set("io", io); - return httpServer; -} diff --git a/src/utils/bleu.js b/src/utils/bleu.js deleted file mode 100644 index aa94411..0000000 --- a/src/utils/bleu.js +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Calculates BLEU score between a reference and candidate sentence. - * Reference and candidate should be token arrays (e.g. split by whitespace). - * We compute modified n-gram precisions for n=1..4, geometric mean, with smoothing (optional). - * We include the brevity penalty. - */ - -/** - * Extracts n-grams from a sequence of tokens. - * @param {string[]} tokens - Array of tokens. - * @param {number} n - Size of the n-gram. - * @returns {Object} Map from n-gram string to its count. - */ -function getNGramCounts(tokens, n) { - const counts = Object.create(null); - for (let i = 0; i + n <= tokens.length; i++) { - const gram = tokens.slice(i, i + n).join(" "); - counts[gram] = (counts[gram] || 0) + 1; - } - return counts; -} - -/** - * Computes modified precision for a given n. - * @param {string[]} reference - Reference token array. - * @param {string[]} candidate - Candidate token array. - * @param {number} n - n-gram order. - * @returns {number} Modified n-gram precision. - */ -function modifiedPrecision(reference, candidate, n) { - const refCounts = getNGramCounts(reference, n); - const candCounts = getNGramCounts(candidate, n); - let matchCount = 0; - let totalCount = 0; - - for (const gram in candCounts) { - const countCand = candCounts[gram]; - const countRef = refCounts[gram] || 0; - matchCount += Math.min(countCand, countRef); - totalCount += countCand; - } - - // Avoid division by zero - return totalCount === 0 ? 0 : matchCount / totalCount; -} - -/** - * Computes brevity penalty. - * @param {number} refLength - Length of reference sentence. - * @param {number} candLength - Length of candidate sentence. - * @returns {number} Brevity penalty. - */ -function brevityPenalty(refLength, candLength) { - if (candLength > refLength) { - return 1; - } - if (candLength === 0) { - return 0; - } - return Math.exp(1 - refLength / candLength); -} - -/** - * Computes BLEU score. - * @param {string} refSentence - Reference sentence. - * @param {string} candSentence - Candidate sentence. - * @param {number} maxN - Maximum n-gram order (default=4). - * @param {boolean} smooth - Whether to apply smoothing (default=false). - * @returns {number} BLEU score between 0 and 1. - */ -export function bleu(refSentence, candSentence, maxN = 4, smooth = false) { - const reference = refSentence.trim().split(/\s+/); - const candidate = candSentence.trim().split(/\s+/); - const refLen = reference.length; - const candLen = candidate.length; - - // count how many times we've hit a zero count so far - const precisions = []; - for (let n = 1; n <= maxN; n++) { - let p = modifiedPrecision(reference, candidate, n); - if (p === 0 && smooth) { - p = 1 / Math.pow(candLen, n); - } - precisions.push(p); - } - - // Compute geometric mean of precisions - // if any precision is zero (and no smoothing), BLEU=0 - if (precisions.some((p) => p === 0)) { - return 0; - } - - const logPrecisionSum = - precisions.map((p) => Math.log(p)).reduce((a, b) => a + b, 0) / maxN; - const geoMean = Math.exp(logPrecisionSum); - - const bp = brevityPenalty(refLen, candLen); - return bp * geoMean; -} - -// if __name__ == "__main__" -if (process.argv[1] === import.meta.filename) { - const test_pairs = [ - ["the cat is on the mat", "the cat is on the mat"], - ["the cat is on the mat", "the the the the the the the"], - ["the cat is on the mat", "the cat on the mat"], - ["the cat is on the mat", "the cat is on the"], - ["the cat is on the mat", "foo bar baz qux"], - [ - "The quick brown fox jumps over the lazy dog", - "The quick brown dog jumps over the lazy fox", - ], - [ - "This could be `static` to prevent any funkiness, i.e. attempting to use class state during the constructor or similar.", - "This could be `static` to prevent any funkiness, i.e. attempting to use class state during the constructor or similar.", - ], - ]; - - for (const [reference, candidate] of test_pairs) { - const score = bleu(reference, candidate, 4); - console.log(`reference: ${reference}`); - console.log(`candidate: ${candidate}`); - console.log(`BLEU score: ${score.toFixed(4)}`); - } -} diff --git a/src/utils/errors.js b/src/utils/errors.js deleted file mode 100644 index b416da5..0000000 --- a/src/utils/errors.js +++ /dev/null @@ -1,6 +0,0 @@ -export class InvalidJsonFormatError extends Error { - constructor(message = 'JSON must be an object mapping strings to strings') { - super(message); - this.name = 'InvalidJsonFormatError'; - } -} \ No newline at end of file diff --git a/src/utils/errors.py b/src/utils/errors.py new file mode 100644 index 0000000..aefbb18 --- /dev/null +++ b/src/utils/errors.py @@ -0,0 +1,4 @@ +class InvalidJsonFormatError(Exception): + def __init__(self, message='JSON must be an object mapping strings to strings'): + super().__init__(message) + self.name = 'InvalidJsonFormatError' diff --git a/src/utils/paths.js b/src/utils/paths.js deleted file mode 100644 index ed828aa..0000000 --- a/src/utils/paths.js +++ /dev/null @@ -1,11 +0,0 @@ -import { fileURLToPath } from 'url'; -import { dirname, join } from 'path'; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = dirname(__filename); - -// Get the project root directory (2 levels up from src/utils) -export const PROJECT_ROOT = join(__dirname, '../..'); - -// Helper function to create paths relative to project root -export const getProjectPath = (relativePath) => join(PROJECT_ROOT, relativePath); \ No newline at end of file diff --git a/src/utils/paths.py b/src/utils/paths.py new file mode 100644 index 0000000..0af0428 --- /dev/null +++ b/src/utils/paths.py @@ -0,0 +1,8 @@ +# utils/paths.py +from pathlib import Path + +PROJECT_ROOT = Path(__file__).resolve().parent.parent + + +def get_project_path(relative_path: str) -> Path: + return PROJECT_ROOT / relative_path diff --git a/src/utils/process_data.js b/src/utils/process_data.js deleted file mode 100644 index 1316dc9..0000000 --- a/src/utils/process_data.js +++ /dev/null @@ -1,58 +0,0 @@ -import fs from "fs"; -import { getProjectPath } from "../utils/paths.js"; -import { bleu } from "../utils/bleu.js"; - -function buildReferenceMap(dataset_path) { - const referenceMap = {}; - const dataset = JSON.parse(fs.readFileSync(dataset_path)); - for (const entry of dataset.entries) { - const id = entry.metadata.id; - const comments = entry.comments; - referenceMap[id] = comments.map((c) => c.body); - } - return referenceMap; -} - -const REFERENCE_MAP = buildReferenceMap(getProjectPath("data/dataset.json")); - -export const evaluate_comments = (answers, percent_cb) => { - const total = Object.keys(answers).length; - let i = 0; - const results = {}; - for (const [id, generated_comment] of Object.entries(answers)) { - const n_tokens_generated = generated_comment.trim().split(/\s+/).length; - if (!(id in REFERENCE_MAP)) { - console.error(`id: "${id}" is not present in the dataset`); - continue; - } - const paraphrases = REFERENCE_MAP[id]; - - let maxScore = 0; - const scores = []; - for (const paraphrase of paraphrases) { - const n_tokens_paraphrase = paraphrase.trim().split(/\s+/).length; - const max_n = Math.min(n_tokens_generated, n_tokens_paraphrase, 4); - const score = bleu(paraphrase, generated_comment, max_n); - scores.push(score); - maxScore = Math.max(score, maxScore); - } - results[id] = { - max_bleu_score: maxScore, - bleu_scores: scores, - proposed_comment: generated_comment, - }; - percent_cb(Math.floor((++i / total) * 100)); - } - return results; -}; - -export const evaluate_refinement = (answers, percent_cb) => { - const total = Object.keys(answers).length; - let i = 0; - for (const [key, value] of Object.entries(answers)) { - console.log(`Processing ${key}: ${value}...`); - // await new Promise((res) => setTimeout(res, 1000)); - console.log("Done"); - percent_cb(Math.floor((++i / total) * 100)); - } -}; diff --git a/src/utils/process_data.py b/src/utils/process_data.py new file mode 100644 index 0000000..dff56b5 --- /dev/null +++ b/src/utils/process_data.py @@ -0,0 +1,46 @@ +# utils/process_data.py +import json +import sys +from .paths import get_project_path +from sacrebleu import sentence_bleu as bleu + + +def build_reference_map(dataset_path: str) -> dict[str, list[str]]: + ref_map = {} + data = json.loads(open(dataset_path).read()) + for entry in data['entries']: + id_ = entry['metadata']['id'] + comments = entry['comments'] + ref_map[id_] = [c['body'] for c in comments] + return ref_map + + +REFERENCE_MAP = build_reference_map(str(get_project_path('../data/dataset.json'))) + + +def evaluate_comments(answers: dict[str, str], percent_cb): + total = len(answers) + results = {} + for i, (id_, gen) in enumerate(answers.items(), 1): + if id_ not in REFERENCE_MAP: + print(f'id: "{id_}" is not present in the dataset', file=sys.stderr) + continue + paraphrases = REFERENCE_MAP[id_] + max_score = 0.0 + scores = [] + for p in paraphrases: + score = bleu(gen, [p]).score + scores.append(score) + max_score = max(max_score, score) + results[id_] = {'max_bleu_score': max_score, 'bleu_scores': scores, 'proposed_comment': gen} + percent_cb(int(i / total * 100)) + return results + + +def evaluate_refinement(answers: dict[str, str], percent_cb): + total = len(answers) + for i, (key, value) in enumerate(answers.items(), 1): + print(f"Processing {key}: {value}...") + # time.sleep(1) + print("Done") + percent_cb(int(i / total * 100))