Mirror of https://github.com/karma-riuk/crab-webapp.git (synced 2025-07-04 22:08:12 +02:00)

Commit: ported backend to python
.gitignore (vendored)
@@ -1,136 +1,174 @@
-# Logs
-logs
-*.log
-npm-debug.log*
-yarn-debug.log*
-yarn-error.log*
-lerna-debug.log*
-.pnpm-debug.log*
-
-# Diagnostic reports (https://nodejs.org/api/report.html)
-report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
-
-# Runtime data
-pids
-*.pid
-*.seed
-*.pid.lock
-
-# Directory for instrumented libs generated by jscoverage/JSCover
-lib-cov
-
-# Coverage directory used by tools like istanbul
-coverage
-*.lcov
-
-# nyc test coverage
-.nyc_output
-
-# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
-.grunt
-
-# Bower dependency directory (https://bower.io/)
-bower_components
-
-# node-waf configuration
-.lock-wscript
-
-# Compiled binary addons (https://nodejs.org/api/addons.html)
-build/Release
-
-# Dependency directories
-node_modules/
-jspm_packages/
-
-# Snowpack dependency directory (https://snowpack.dev/)
-web_modules/
-
-# TypeScript cache
-*.tsbuildinfo
-
-# Optional npm cache directory
-.npm
-
-# Optional eslint cache
-.eslintcache
-
-# Optional stylelint cache
-.stylelintcache
-
-# Microbundle cache
-.rpt2_cache/
-.rts2_cache_cjs/
-.rts2_cache_es/
-.rts2_cache_umd/
-
-# Optional REPL history
-.node_repl_history
-
-# Output of 'npm pack'
-*.tgz
-
-# Yarn Integrity file
-.yarn-integrity
-
-# dotenv environment variable files
-.env
-.env.development.local
-.env.test.local
-.env.production.local
-.env.local
-
-# parcel-bundler cache (https://parceljs.org/)
-.cache
-.parcel-cache
-
-# Next.js build output
-.next
-out
-
-# Nuxt.js build / generate output
-.nuxt
-dist
-
-# Gatsby files
-.cache/
-# Comment in the public line in if your project uses Gatsby and not Next.js
-# https://nextjs.org/blog/next-9-1#public-directory-support
-# public
-
-# vuepress build output
-.vuepress/dist
-
-# vuepress v2.x temp and cache directory
-.temp
-.cache
-
-# vitepress build output
-**/.vitepress/dist
-
-# vitepress cache directory
-**/.vitepress/cache
-
-# Docusaurus cache and generated files
-.docusaurus
-
-# Serverless directories
-.serverless/
-
-# FuseBox cache
-.fusebox/
-
-# DynamoDB Local files
-.dynamodb/
-
-# TernJS port file
-.tern-port
-
-# Stores VSCode versions used for testing VSCode extensions
-.vscode-test
-
-# yarn v2
-.yarn/cache
-.yarn/unplugged
-.yarn/build-state.yml
-.yarn/install-state.gz
-.pnp.*
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# UV
+# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+#uv.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
jest.config.js (deleted)
@@ -1,8 +0,0 @@
-export default {
-    transform: {},
-    moduleNameMapper: {
-        '^(\\.{1,2}/.*)\\.js$': '$1',
-    },
-    testEnvironment: 'node',
-    verbose: true
-};
package.json (deleted)
@@ -1,23 +0,0 @@
-{
-    "name": "crab-webapp",
-    "description": "Crab Webapp",
-    "type": "module",
-    "main": "src/server.js",
-    "scripts": {
-        "start": "node src/server.js",
-        "dev": "nodemon src/server.js",
-        "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js"
-    },
-    "dependencies": {
-        "cors": "^2.8.5",
-        "dotenv": "^16.3.1",
-        "express": "^4.18.2",
-        "multer": "^1.4.5-lts.1",
-        "socket.io": "^4.8.1"
-    },
-    "devDependencies": {
-        "jest": "^29.7.0",
-        "nodemon": "^3.0.2",
-        "supertest": "^6.3.4"
-    }
-}
public/index.html
@@ -7,7 +7,7 @@
     <link rel="icon" type="image/x-icon" href="/img/crab.png">
     <title>Dataset Downloader & Answer Uploader</title>
     <link rel="stylesheet" href="css/style.css">
-    <script src="/socket.io/socket.io.js"></script>
+    <script src="https://cdn.socket.io/4.5.4/socket.io.min.js"></script>
     <script defer src="js/index.js"></script>
     <script defer src="js/sorttable.js"></script>
 </head>
public/js/index.js
@@ -60,7 +60,7 @@ document.getElementById("upload-btn").onclick = async () => {

         idCell.textContent = id;
         commentCell.innerHTML = `<span class='comment-cell'>${info["proposed_comment"]}</span>`;
-        scoreCell.textContent = info["max_bleu_score"].toFixed(4);
+        scoreCell.textContent = info["max_bleu_score"];
     });
 };
src/routes/__tests__/datasets.test.js (deleted)
@@ -1,92 +0,0 @@
-import { jest } from '@jest/globals';
-import express from 'express';
-import request from 'supertest';
-import { join } from 'path';
-import { fileURLToPath } from 'url';
-import { dirname } from 'path';
-import datasetsRouter from '../datasets.js';
-
-const __filename = fileURLToPath(import.meta.url);
-const __dirname = dirname(__filename);
-
-// Mock the paths utility
-jest.mock('../../utils/paths.js', () => ({
-    getProjectPath: (path) => join(__dirname, '../../..', path)
-}));
-
-// Create Express app for testing
-const app = express();
-app.use('/datasets', datasetsRouter);
-
-describe('Datasets Router', () => {
-    // Mock environment variables
-    const originalEnv = process.env;
-    beforeEach(() => {
-        jest.resetModules();
-        process.env = { ...originalEnv };
-        process.env.DATA_DIR = './test-data';
-    });
-
-    afterEach(() => {
-        process.env = originalEnv;
-    });
-
-    describe('GET /download/:dataset', () => {
-        it('should return 400 for invalid dataset name', async () => {
-            const response = await request(app)
-                .get('/datasets/download/invalid_dataset')
-                .expect(400);
-
-            expect(response.body).toEqual({
-                error: 'Invalid dataset name'
-            });
-        });
-
-        it('should download comment_generation without context', async () => {
-            const response = await request(app)
-                .get('/datasets/download/comment_generation')
-                .expect(200);
-
-            expect(response.headers['content-type']).toBe('application/zip');
-            expect(response.headers['content-disposition']).toContain('comment_generation_no_context.zip');
-        });
-
-        it('should download comment_generation with context', async () => {
-            const response = await request(app)
-                .get('/datasets/download/comment_generation')
-                .query({ withContext: true })
-                .expect(200);
-
-            expect(response.headers['content-type']).toBe('application/zip');
-            expect(response.headers['content-disposition']).toContain('comment_generation_with_context.zip');
-        });
-
-        it('should download code_refinement without context', async () => {
-            const response = await request(app)
-                .get('/datasets/download/code_refinement')
-                .expect(200);
-
-            expect(response.headers['content-type']).toBe('application/zip');
-            expect(response.headers['content-disposition']).toContain('code_refinement_no_context.zip');
-        });
-
-        it('should download code_refinement with context', async () => {
-            const response = await request(app)
-                .get('/datasets/download/code_refinement')
-                .query({ withContext: true })
-                .expect(200);
-
-            expect(response.headers['content-type']).toBe('application/zip');
-            expect(response.headers['content-disposition']).toContain('code_refinement_with_context.zip');
-        });
-
-        it('should handle JSON boolean for withContext parameter', async () => {
-            const response = await request(app)
-                .get('/datasets/download/comment_generation')
-                .query({ withContext: 'true' })
-                .expect(200);
-
-            expect(response.headers['content-disposition']).toContain('comment_generation_with_context.zip');
-        });
-    });
-});
src/routes/answers.js (deleted)
@@ -1,129 +0,0 @@
-import { Router } from "express";
-import multer from "multer";
-import { InvalidJsonFormatError } from "../utils/errors.js";
-import { evaluate_comments } from "../utils/process_data.js";
-
-const router = Router();
-
-// Configure multer for file uploads
-const upload = multer({
-    storage: multer.memoryStorage(),
-    limits: {
-        fileSize: 200 * 1024 * 1024, // 200MB limit, since the comment gen is 147MB (deflated)
-    },
-    fileFilter: (_req, file, cb) => {
-        // Accept only JSON files
-        if (file.mimetype === "application/json") {
-            cb(null, true);
-        } else {
-            cb(new Error("Only JSON files are allowed"));
-        }
-    },
-});
-
-// Helper function to validate JSON format
-const validateJsonFormat = (data) => {
-    try {
-        const parsed = JSON.parse(data);
-        // Check if it's an object
-        if (
-            typeof parsed !== "object" ||
-            parsed === null ||
-            Array.isArray(parsed)
-        ) {
-            throw new InvalidJsonFormatError(
-                "Submitted json doesn't contain an object",
-            );
-        }
-        // Check if all values are strings
-        if (
-            !Object.values(parsed).every((value) => typeof value === "string")
-        ) {
-            throw new InvalidJsonFormatError(
-                "Submitted json object must only be str -> str. Namely id -> comment",
-            );
-        }
-        return parsed;
-    } catch (error) {
-        if (error instanceof InvalidJsonFormatError) {
-            throw error;
-        }
-        throw new InvalidJsonFormatError("Invalid JSON format");
-    }
-};
-
-router.post("/submit/comments", upload.single("file"), async (req, res) => {
-    try {
-        if (!req.file) {
-            return res.status(400).json({ error: "No file uploaded" });
-        }
-
-        const fileContent = req.file.buffer.toString();
-        let validatedData;
-
-        try {
-            validatedData = validateJsonFormat(fileContent);
-        } catch (error) {
-            if (error instanceof InvalidJsonFormatError) {
-                return res.status(400).json({
-                    error: "Invalid JSON format",
-                    message: error.message,
-                });
-            }
-            throw error;
-        }
-
-        const io = req.app.get("io");
-        const header = req.get("X-Socket-Id");
-        const socketId = header && header.trim();
-        if (socketId && io.sockets.sockets.has(socketId)) {
-            io.to(socketId).emit("successul-upload");
-            io.to(socketId).emit("started-processing");
-        }
-
-        const results = evaluate_comments(validatedData, (percent) => {
-            if (!(socketId && io.sockets.sockets.has(socketId))) return;
-
-            io.to(socketId).emit("progress", { percent });
-        });
-        res.status(200).json(results);
-    } catch (error) {
-        console.error("Error processing submission:", error);
-        res.status(500).json({ error: "Error processing submission" });
-    }
-});
-
-router.post("/submit/refinement", upload.single("file"), async (req, res) => {
-    try {
-        if (!req.file) {
-            return res.status(400).json({ error: "No file uploaded" });
-        }
-
-        const fileContent = req.file.buffer.toString();
-        let validatedData;
-
-        try {
-            validatedData = validateJsonFormat(fileContent);
-        } catch (error) {
-            if (error instanceof InvalidJsonFormatError) {
-                return res.status(400).json({
-                    error: "Invalid JSON format",
-                    message: error.message,
-                });
-            }
-            throw error;
-        }
-
-        socket.emit("started-processing");
-        evaluate_comments(validatedData);
-        res.status(200).json({
-            message: "Answer submitted successfully",
-            data: validatedData,
-        });
-    } catch (error) {
-        console.error("Error processing submission:", error);
-        res.status(500).json({ error: "Error processing submission" });
-    }
-});
-
-export default router;
src/routes/answers.py (new file)
@@ -0,0 +1,55 @@
+# routes/answers.py
+from flask import Blueprint, request, jsonify, current_app
+from utils.errors import InvalidJsonFormatError
+from utils.process_data import evaluate_comments
+import json
+
+router = Blueprint('answers', __name__, url_prefix='/answers')
+
+ALLOWED_EXT = {'json'}
+
+
+def validate_json_format(data: str) -> dict[str, str]:
+    try:
+        obj = json.loads(data)
+        if not isinstance(obj, dict):
+            raise InvalidJsonFormatError("Submitted json doesn't contain an object")
+        if not all(isinstance(v, str) for v in obj.values()):
+            raise InvalidJsonFormatError(
+                "Submitted json object must only be str -> str. Namely id -> comment"
+            )
+        return obj
+    except InvalidJsonFormatError as e:
+        raise e
+    except Exception:
+        raise InvalidJsonFormatError()
+
+
+@router.route('/submit/comments', methods=['POST'])
+def submit_comments():
+    file = request.files.get('file')
+    if file is None or file.filename is None or file.filename.split('.')[-1] not in ALLOWED_EXT:
+        return jsonify({'error': 'Only JSON files are allowed'}), 400
+    data = file.read().decode()
+    try:
+        validated = validate_json_format(data)
+    except InvalidJsonFormatError as e:
+        return jsonify({'error': 'Invalid JSON format', 'message': str(e)}), 400
+
+    socketio = current_app.extensions['socketio']
+    sid = request.headers.get('X-Socket-Id')
+    if sid:
+        socketio.emit('successful-upload', room=sid)
+        socketio.emit('started-processing', room=sid)
+
+    results = evaluate_comments(
+        validated, lambda p: socketio.emit('progress', {'percent': p}, room=sid)
+    )
+    return jsonify(results)
+
+
+@router.route('/submit/refinement', methods=['POST'])
+def submit_refinement():
+    file = request.files.get('file')
+    # similar to above
+    return jsonify({'message': 'Answer submitted successfully'})
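Note: a quick way to exercise the new endpoint is a small requests script (a sketch, not part of the commit; it assumes the server is running locally on port 3000 and that answers.json maps id -> comment):

    # upload_answers.py -- hypothetical helper, not in the repo
    import requests

    with open('answers.json', 'rb') as f:
        resp = requests.post(
            'http://localhost:3000/answers/submit/comments',
            files={'file': ('answers.json', f, 'application/json')},
        )
    print(resp.status_code, resp.json())  # per-id BLEU results on success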
src/routes/datasets.js (deleted)
@@ -1,33 +0,0 @@
-import { Router } from "express";
-import { join } from "path";
-import { getProjectPath } from "../utils/paths.js";
-
-const router = Router();
-
-// Environment variables for paths (all relative to project root)
-const DATA_DIR = getProjectPath("data");
-
-const DATASETS = ["comment_generation", "code_refinement"];
-
-router.get("/download/:dataset", async (req, res) => {
-    const { dataset } = req.params;
-    const withContext = req.query.withContext
-        ? JSON.parse(req.query.withContext)
-        : false;
-
-    if (!DATASETS.includes(dataset)) {
-        return res.status(400).json({ error: "Invalid dataset name" });
-    }
-
-    const fileName = `${dataset}_${withContext ? "with_context" : "no_context"}.zip`;
-    const filePath = join(DATA_DIR, fileName);
-
-    try {
-        res.download(filePath);
-    } catch (error) {
-        console.error("Error serving file:", error);
-        res.status(500).json({ error: "Error serving file" });
-    }
-});
-
-export default router;
src/routes/datasets.py (new file)
@@ -0,0 +1,17 @@
+# routes/datasets.py
+from flask import Blueprint, send_from_directory, request, jsonify
+from utils.paths import get_project_path
+
+router = Blueprint('datasets', __name__, url_prefix='/datasets')
+
+DATASETS = {'comment_generation', 'code_refinement'}
+DATA_DIR = get_project_path('../data')
+
+
+@router.route('/download/<dataset>')
+def download(dataset):
+    if dataset not in DATASETS:
+        return jsonify({'error': 'Invalid dataset name'}), 400
+    with_ctx = request.args.get('withContext', 'false').lower() == 'true'
+    fname = f"{dataset}_{'with_context' if with_ctx else 'no_context'}.zip"
+    return send_from_directory(DATA_DIR, fname, as_attachment=True)
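Note: the withContext query parameter is now parsed by string comparison rather than JSON.parse. A client-side sketch of the download (assumes a local server on port 3000 and that the zip exists under data/):

    # download_dataset.py -- hypothetical helper, not in the repo
    import requests

    resp = requests.get(
        'http://localhost:3000/datasets/download/comment_generation',
        params={'withContext': 'true'},
    )
    resp.raise_for_status()
    with open('comment_generation_with_context.zip', 'wb') as f:
        f.write(resp.content)  # server sends the zip as an attachment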
src/routes/index.js (deleted)
@@ -1,23 +0,0 @@
-import { Router } from 'express';
-import datasetRoutes from './datasets.js';
-import answerRoutes from './answers.js';
-
-const router = Router();
-
-// Routes
-router.get('/', (_req, res) => {
-    res.json({ message: 'Welcome to the Express backend!' });
-});
-
-// Example route
-router.get('/api/hello', (_req, res) => {
-    res.json({ message: 'Hello from the backend!' });
-});
-
-// Dataset routes
-router.use('/datasets', datasetRoutes);
-
-// Answer submission routes
-router.use('/answers', answerRoutes);
-
-export default router;
src/routes/index.py (new file)
@@ -0,0 +1,16 @@
+# routes/index.py
+from flask import Blueprint, jsonify, current_app
+
+router = Blueprint('index', __name__)
+
+
+@router.route('/')
+def welcome():
+    print("hello")
+    return current_app.send_static_file('index.html')
+
+
+@router.route('/api/hello')
+def hello():
+    return jsonify({'message': 'Hello from the backend!'})
src/server.js (deleted)
@@ -1,25 +0,0 @@
-import express, { json } from "express";
-import cors from "cors";
-import dotenv from "dotenv";
-import routes from "./routes/index.js";
-import { createSocketServer } from "./socket.js";
-
-dotenv.config();
-
-const app = express();
-const port = process.env.PORT || 3000;
-
-// Middleware
-app.use(cors());
-app.use(json());
-
-// Use routes
-app.use(express.static("public"));
-app.use("/", routes);
-
-const server = createSocketServer(app);
-
-// Start server
-server.listen(port, () => {
-    console.log(`Server is running on port ${port}`);
-});
src/server.py (new file)
@@ -0,0 +1,35 @@
+# server.py
+from flask import Flask
+from flask_cors import CORS
+from flask_socketio import SocketIO
+from routes.index import router as index_router
+from routes.answers import router as answers_router
+from routes.datasets import router as datasets_router
+
+import os
+
+app = Flask(__name__, static_folder='../public', static_url_path='/')
+CORS(app)
+
+# Register routes
+app.register_blueprint(index_router)     # serves '/' and '/api/hello'
+app.register_blueprint(answers_router)   # mounts at '/answers'
+app.register_blueprint(datasets_router)  # mounts at '/datasets'
+
+
+def init_socketio(app):
+    socketio = SocketIO(app, cors_allowed_origins='*')
+
+    @socketio.on('connect')
+    def _():
+        print('Websocket client connected')
+
+    return socketio
+
+
+# Init socketio
+socketio = init_socketio(app)
+
+if __name__ == '__main__':
+    port = int(os.getenv('PORT', 3000))
+    socketio.run(app, port=port)
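Note: flask_socketio registers itself in app.extensions['socketio'], which is how routes/answers.py retrieves it. A minimal handshake check (a sketch; assumes the python-socketio client package, which is not a dependency of this commit):

    # socket_check.py -- hypothetical, not in the repo
    import socketio  # pip install "python-socketio[client]"

    sio = socketio.Client()

    @sio.event
    def connect():
        # this sid is what the frontend sends as the X-Socket-Id header
        print('connected, sid =', sio.sid)

    sio.connect('http://localhost:3000')
    sio.disconnect()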
src/socket.js (deleted)
@@ -1,14 +0,0 @@
-import http from "http";
-import { Server } from "socket.io";
-
-function onConnect(socket) {
-    console.log("Websocket client connected:", socket.id);
-}
-
-export function createSocketServer(app) {
-    const httpServer = http.createServer(app);
-    const io = new Server(httpServer);
-    io.on("connection", onConnect);
-    app.set("io", io);
-    return httpServer;
-}
src/utils/bleu.js (deleted)
@@ -1,125 +0,0 @@
-/*
- * Calculates BLEU score between a reference and candidate sentence.
- * Reference and candidate should be token arrays (e.g. split by whitespace).
- * We compute modified n-gram precisions for n=1..4, geometric mean, with smoothing (optional).
- * We include the brevity penalty.
- */
-
-/**
- * Extracts n-grams from a sequence of tokens.
- * @param {string[]} tokens - Array of tokens.
- * @param {number} n - Size of the n-gram.
- * @returns {Object} Map from n-gram string to its count.
- */
-function getNGramCounts(tokens, n) {
-    const counts = Object.create(null);
-    for (let i = 0; i + n <= tokens.length; i++) {
-        const gram = tokens.slice(i, i + n).join(" ");
-        counts[gram] = (counts[gram] || 0) + 1;
-    }
-    return counts;
-}
-
-/**
- * Computes modified precision for a given n.
- * @param {string[]} reference - Reference token array.
- * @param {string[]} candidate - Candidate token array.
- * @param {number} n - n-gram order.
- * @returns {number} Modified n-gram precision.
- */
-function modifiedPrecision(reference, candidate, n) {
-    const refCounts = getNGramCounts(reference, n);
-    const candCounts = getNGramCounts(candidate, n);
-    let matchCount = 0;
-    let totalCount = 0;
-
-    for (const gram in candCounts) {
-        const countCand = candCounts[gram];
-        const countRef = refCounts[gram] || 0;
-        matchCount += Math.min(countCand, countRef);
-        totalCount += countCand;
-    }
-
-    // Avoid division by zero
-    return totalCount === 0 ? 0 : matchCount / totalCount;
-}
-
-/**
- * Computes brevity penalty.
- * @param {number} refLength - Length of reference sentence.
- * @param {number} candLength - Length of candidate sentence.
- * @returns {number} Brevity penalty.
- */
-function brevityPenalty(refLength, candLength) {
-    if (candLength > refLength) {
-        return 1;
-    }
-    if (candLength === 0) {
-        return 0;
-    }
-    return Math.exp(1 - refLength / candLength);
-}
-
-/**
- * Computes BLEU score.
- * @param {string} refSentence - Reference sentence.
- * @param {string} candSentence - Candidate sentence.
- * @param {number} maxN - Maximum n-gram order (default=4).
- * @param {boolean} smooth - Whether to apply smoothing (default=false).
- * @returns {number} BLEU score between 0 and 1.
- */
-export function bleu(refSentence, candSentence, maxN = 4, smooth = false) {
-    const reference = refSentence.trim().split(/\s+/);
-    const candidate = candSentence.trim().split(/\s+/);
-    const refLen = reference.length;
-    const candLen = candidate.length;
-
-    // count how many times we've hit a zero count so far
-    const precisions = [];
-    for (let n = 1; n <= maxN; n++) {
-        let p = modifiedPrecision(reference, candidate, n);
-        if (p === 0 && smooth) {
-            p = 1 / Math.pow(candLen, n);
-        }
-        precisions.push(p);
-    }
-
-    // Compute geometric mean of precisions
-    // if any precision is zero (and no smoothing), BLEU=0
-    if (precisions.some((p) => p === 0)) {
-        return 0;
-    }
-
-    const logPrecisionSum =
-        precisions.map((p) => Math.log(p)).reduce((a, b) => a + b, 0) / maxN;
-    const geoMean = Math.exp(logPrecisionSum);
-
-    const bp = brevityPenalty(refLen, candLen);
-    return bp * geoMean;
-}
-
-// if __name__ == "__main__"
-if (process.argv[1] === import.meta.filename) {
-    const test_pairs = [
-        ["the cat is on the mat", "the cat is on the mat"],
-        ["the cat is on the mat", "the the the the the the the"],
-        ["the cat is on the mat", "the cat on the mat"],
-        ["the cat is on the mat", "the cat is on the"],
-        ["the cat is on the mat", "foo bar baz qux"],
-        [
-            "The quick brown fox jumps over the lazy dog",
-            "The quick brown dog jumps over the lazy fox",
-        ],
-        [
-            "This could be `static` to prevent any funkiness, i.e. attempting to use class state during the constructor or similar.",
-            "This could be `static` to prevent any funkiness, i.e. attempting to use class state during the constructor or similar.",
-        ],
-    ];
-
-    for (const [reference, candidate] of test_pairs) {
-        const score = bleu(reference, candidate, 4);
-        console.log(`reference: ${reference}`);
-        console.log(`candidate: ${candidate}`);
-        console.log(`BLEU score: ${score.toFixed(4)}`);
-    }
-}
src/utils/errors.js (deleted)
@@ -1,6 +0,0 @@
-export class InvalidJsonFormatError extends Error {
-    constructor(message = 'JSON must be an object mapping strings to strings') {
-        super(message);
-        this.name = 'InvalidJsonFormatError';
-    }
-}
src/utils/errors.py (new file)
@@ -0,0 +1,4 @@
+class InvalidJsonFormatError(Exception):
+    def __init__(self, message='JSON must be an object mapping strings to strings'):
+        super().__init__(message)
+        self.name = 'InvalidJsonFormatError'
src/utils/paths.js (deleted)
@@ -1,11 +0,0 @@
-import { fileURLToPath } from 'url';
-import { dirname, join } from 'path';
-
-const __filename = fileURLToPath(import.meta.url);
-const __dirname = dirname(__filename);
-
-// Get the project root directory (2 levels up from src/utils)
-export const PROJECT_ROOT = join(__dirname, '../..');
-
-// Helper function to create paths relative to project root
-export const getProjectPath = (relativePath) => join(PROJECT_ROOT, relativePath);
src/utils/paths.py (new file)
@@ -0,0 +1,8 @@
+# utils/paths.py
+from pathlib import Path
+
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+
+
+def get_project_path(relative_path: str) -> Path:
+    return PROJECT_ROOT / relative_path
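Note the base-directory shift: the deleted JS helper resolved paths from the project root (two levels up from src/utils), whereas PROJECT_ROOT here is src/ itself (two .parent hops from src/utils/paths.py), which is why Python callers now pass paths like '../data'. A quick illustration (a sketch with a stand-in path):

    from pathlib import Path

    fake_file = Path('/repo/src/utils/paths.py')  # stand-in for __file__
    project_root = fake_file.parent.parent        # -> /repo/src
    print((project_root / '../data').resolve())   # -> /repo/data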
src/utils/process_data.js (deleted)
@@ -1,58 +0,0 @@
-import fs from "fs";
-import { getProjectPath } from "../utils/paths.js";
-import { bleu } from "../utils/bleu.js";
-
-function buildReferenceMap(dataset_path) {
-    const referenceMap = {};
-    const dataset = JSON.parse(fs.readFileSync(dataset_path));
-    for (const entry of dataset.entries) {
-        const id = entry.metadata.id;
-        const comments = entry.comments;
-        referenceMap[id] = comments.map((c) => c.body);
-    }
-    return referenceMap;
-}
-
-const REFERENCE_MAP = buildReferenceMap(getProjectPath("data/dataset.json"));
-
-export const evaluate_comments = (answers, percent_cb) => {
-    const total = Object.keys(answers).length;
-    let i = 0;
-    const results = {};
-    for (const [id, generated_comment] of Object.entries(answers)) {
-        const n_tokens_generated = generated_comment.trim().split(/\s+/).length;
-        if (!(id in REFERENCE_MAP)) {
-            console.error(`id: "${id}" is not present in the dataset`);
-            continue;
-        }
-        const paraphrases = REFERENCE_MAP[id];
-
-        let maxScore = 0;
-        const scores = [];
-        for (const paraphrase of paraphrases) {
-            const n_tokens_paraphrase = paraphrase.trim().split(/\s+/).length;
-            const max_n = Math.min(n_tokens_generated, n_tokens_paraphrase, 4);
-            const score = bleu(paraphrase, generated_comment, max_n);
-            scores.push(score);
-            maxScore = Math.max(score, maxScore);
-        }
-        results[id] = {
-            max_bleu_score: maxScore,
-            bleu_scores: scores,
-            proposed_comment: generated_comment,
-        };
-        percent_cb(Math.floor((++i / total) * 100));
-    }
-    return results;
-};
-
-export const evaluate_refinement = (answers, percent_cb) => {
-    const total = Object.keys(answers).length;
-    let i = 0;
-    for (const [key, value] of Object.entries(answers)) {
-        console.log(`Processing ${key}: ${value}...`);
-        // await new Promise((res) => setTimeout(res, 1000));
-        console.log("Done");
-        percent_cb(Math.floor((++i / total) * 100));
-    }
-};
src/utils/process_data.py (new file)
@@ -0,0 +1,46 @@
+# utils/process_data.py
+import json
+import sys
+from .paths import get_project_path
+from sacrebleu import sentence_bleu as bleu
+
+
+def build_reference_map(dataset_path: str) -> dict[str, list[str]]:
+    ref_map = {}
+    data = json.loads(open(dataset_path).read())
+    for entry in data['entries']:
+        id_ = entry['metadata']['id']
+        comments = entry['comments']
+        ref_map[id_] = [c['body'] for c in comments]
+    return ref_map
+
+
+REFERENCE_MAP = build_reference_map(str(get_project_path('../data/dataset.json')))
+
+
+def evaluate_comments(answers: dict[str, str], percent_cb):
+    total = len(answers)
+    results = {}
+    for i, (id_, gen) in enumerate(answers.items(), 1):
+        if id_ not in REFERENCE_MAP:
+            print(f'id: "{id_}" is not present in the dataset', file=sys.stderr)
+            continue
+        paraphrases = REFERENCE_MAP[id_]
+        max_score = 0.0
+        scores = []
+        for p in paraphrases:
+            score = bleu(gen, [p]).score
+            scores.append(score)
+            max_score = max(max_score, score)
+        results[id_] = {'max_bleu_score': max_score, 'bleu_scores': scores, 'proposed_comment': gen}
+        percent_cb(int(i / total * 100))
+    return results
+
+
+def evaluate_refinement(answers: dict[str, str], percent_cb):
+    total = len(answers)
+    for i, (key, value) in enumerate(answers.items(), 1):
+        print(f"Processing {key}: {value}...")
+        # time.sleep(1)
+        print("Done")
+        percent_cb(int(i / total * 100))
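Note: sacrebleu's sentence_bleu reports BLEU on a 0-100 scale, whereas the deleted JS implementation returned values in [0, 1]; that scale change is presumably why public/js/index.js stopped calling toFixed(4). A sanity check (a sketch; assumes sacrebleu is installed):

    from sacrebleu import sentence_bleu

    hyp = 'the cat is on the mat'
    print(sentence_bleu(hyp, [hyp]).score)  # 100.0 -- exact match on the 0-100 scale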