From 719e1270192d47cb10b1ddafe6ed7a223877b320 Mon Sep 17 00:00:00 2001
From: Karma Riuk
Date: Sat, 10 May 2025 21:59:37 +0200
Subject: [PATCH] removed bleu-score package and wrote my own that didn't have a bug in it (issue with saving "constructor" as a key of a normal object)

---
 package.json              |   1 -
 src/utils/bleu.js         | 153 ++++++++++++++++++++++++++++++++++++++
 src/utils/process_data.js |   3 +-
 3 files changed, 154 insertions(+), 3 deletions(-)
 create mode 100644 src/utils/bleu.js

diff --git a/package.json b/package.json
index 9ffbfd1..93f6a76 100644
--- a/package.json
+++ b/package.json
@@ -9,7 +9,6 @@
     "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js"
   },
   "dependencies": {
-    "bleu-score": "^1.0.4",
     "cors": "^2.8.5",
     "dotenv": "^16.3.1",
     "express": "^4.18.2",
diff --git a/src/utils/bleu.js b/src/utils/bleu.js
new file mode 100644
index 0000000..7f8aa35
--- /dev/null
+++ b/src/utils/bleu.js
@@ -0,0 +1,153 @@
+/*
+ * Calculates the BLEU score between a reference and a candidate sentence.
+ * Sentences are passed as plain strings and tokenized on whitespace.
+ * We compute modified n-gram precisions for n=1..maxN, combine them with a
+ * geometric mean (optionally smoothed) and apply the brevity penalty.
+ */
+
+/**
+ * Splits a sentence into whitespace-separated tokens.
+ * An empty or whitespace-only sentence yields no tokens (a bare
+ * `"".split(/\s+/)` would yield one empty-string token instead).
+ * @param {string} sentence - Sentence to tokenize.
+ * @returns {string[]} Array of tokens.
+ */
+function tokenize(sentence) {
+    const trimmed = sentence.trim();
+    return trimmed === "" ? [] : trimmed.split(/\s+/);
+}
+
+/**
+ * Extracts n-grams from a sequence of tokens.
+ * @param {string[]} tokens - Array of tokens.
+ * @param {number} n - Size of the n-gram.
+ * @returns {Object} Map from n-gram string to its count.
+ */
+function getNGramCounts(tokens, n) {
+    // Prototype-less object: n-grams such as "constructor" must not collide
+    // with properties inherited from Object.prototype.
+    const counts = Object.create(null);
+    for (let i = 0; i + n <= tokens.length; i++) {
+        const gram = tokens.slice(i, i + n).join(" ");
+        counts[gram] = (counts[gram] || 0) + 1;
+    }
+    return counts;
+}
+
+/**
+ * Computes modified precision for a given n.
+ * @param {string[]} reference - Reference token array.
+ * @param {string[]} candidate - Candidate token array.
+ * @param {number} n - n-gram order.
+ * @returns {number} Modified n-gram precision.
+ */
+function modifiedPrecision(reference, candidate, n) {
+    const refCounts = getNGramCounts(reference, n);
+    const candCounts = getNGramCounts(candidate, n);
+    let matchCount = 0;
+    let totalCount = 0;
+
+    for (const gram in candCounts) {
+        const countCand = candCounts[gram];
+        const countRef = refCounts[gram] || 0;
+        // Clip: a candidate n-gram is credited at most as many times as it
+        // occurs in the reference.
+        matchCount += Math.min(countCand, countRef);
+        totalCount += countCand;
+    }
+
+    // Avoid division by zero
+    return totalCount === 0 ? 0 : matchCount / totalCount;
+}
+
+/**
+ * Computes brevity penalty.
+ * @param {number} refLength - Length of reference sentence.
+ * @param {number} candLength - Length of candidate sentence.
+ * @returns {number} Brevity penalty.
+ */
+function brevityPenalty(refLength, candLength) {
+    if (candLength > refLength) {
+        return 1;
+    }
+    if (candLength === 0) {
+        return 0;
+    }
+    return Math.exp(1 - refLength / candLength);
+}
+
+/**
+ * Computes BLEU score.
+ * @param {string} refSentence - Reference sentence.
+ * @param {string} candSentence - Candidate sentence.
+ * @param {number} maxN - Maximum n-gram order (default=4).
+ * @param {boolean} smooth - Whether to apply smoothing (default=false).
+ * @returns {number} BLEU score between 0 and 1.
+ * @throws {RangeError} If maxN is not a positive integer.
+ */
+export function bleu(refSentence, candSentence, maxN = 4, smooth = false) {
+    if (!Number.isInteger(maxN) || maxN < 1) {
+        throw new RangeError(`maxN must be a positive integer, got ${maxN}`);
+    }
+
+    const reference = tokenize(refSentence);
+    const candidate = tokenize(candSentence);
+    const refLen = reference.length;
+    const candLen = candidate.length;
+
+    // Nothing can match when either side has no tokens. Returning early also
+    // keeps the smoothing term below from dividing by zero.
+    if (refLen === 0 || candLen === 0) {
+        return 0;
+    }
+
+    const precisions = [];
+    for (let n = 1; n <= maxN; n++) {
+        let p = modifiedPrecision(reference, candidate, n);
+        if (p === 0) {
+            // No shared token at all (n === 1) means BLEU is 0 even with
+            // smoothing; without smoothing any zero precision zeroes the
+            // geometric mean.
+            if (n === 1 || !smooth) {
+                return 0;
+            }
+            p = 1 / Math.pow(candLen, n);
+        }
+        precisions.push(p);
+    }
+
+    // Geometric mean of the precisions, computed in log space
+    const logPrecisionMean =
+        precisions.reduce((sum, p) => sum + Math.log(p), 0) / maxN;
+    const geoMean = Math.exp(logPrecisionMean);
+
+    return brevityPenalty(refLen, candLen) * geoMean;
+}
+
+// Example usage:
+
+if (process.argv[1] === import.meta.filename) {
+    const testPairs = [
+        ["the cat is on the mat", "the cat is on the mat"],
+        ["the cat is on the mat", "the the the the the the the"],
+        ["the cat is on the mat", "the cat on the mat"],
+        ["the cat is on the mat", "the cat is on the"],
+        ["the cat is on the mat", "foo bar baz qux"],
+        [
+            "The quick brown fox jumps over the lazy dog",
+            "The quick brown dog jumps over the lazy fox",
+        ],
+        // Regression case: contains the token "constructor"
+        [
+            "This could be `static` to prevent any funkiness, i.e. attempting to use class state during the constructor or similar.",
+            "This could be `static` to prevent any funkiness, i.e. attempting to use class state during the constructor or similar.",
+        ],
+    ];
+
+    for (const [reference, candidate] of testPairs) {
+        const score = bleu(reference, candidate, 4);
+        console.log(`reference: ${reference}`);
+        console.log(`candidate: ${candidate}`);
+        console.log(`BLEU score: ${score.toFixed(4)}`);
+    }
+}
diff --git a/src/utils/process_data.js b/src/utils/process_data.js
index 55028ce..bc94487 100644
--- a/src/utils/process_data.js
+++ b/src/utils/process_data.js
@@ -1,6 +1,5 @@
 import fs from "fs";
 import { getProjectPath } from "../utils/paths.js";
-import { bleu } from "bleu-score";
-
+import { bleu } from "../utils/bleu.js";
 function buildReferenceMap(dataset_path) {
     const referenceMap = {};