//@ts-check
// Helpers to work with different data types
// by Humans for All
//

/**
 * Given the limited context size of local LLMs, many a time when the context gets filled
 * between the prompt and the response, it can lead to repeating text garbage generation.
 * And many a time setting a penalty wrt repetition leads to over-intelligent garbage
 * repetition with slight variations. This garbage in turn can lead to overloading of the
 * available model context, leading to less valuable responses for subsequent prompts/queries,
 * if chat history is sent to the ai model.
 *
 * So two simple minded garbage trimming logics are experimented with below.
 * * one based on progressively-larger-substring-based-repeat-matching-with-partial-skip and
 * * another based on char-histogram-driven garbage trimming.
 *   * in future the characteristics of the histogram over varying lengths could be used to
 *     allow for a more aggressive and adaptive trimming logic.
 */

/**
 * Simple minded logic to help remove repeating garbage at the end of the string.
 * The repetition needs to be perfectly matching.
 *
 * The logic progressively probes for longer and longer substring based repetition
 * (substring lengths 1 .. maxSubL-1), counting how many back-to-back copies of the
 * trailing substring exist at the end of the string. It then picks the substring length
 * that covers the longest run (the smallest length wins a tie) and, if that run is longer
 * than maxSubL and at least maxMatchLenThreshold chars, removes the whole run.
 *
 * A run that reaches all the way back to the start of the string is also counted, so a
 * string consisting only of repeated garbage is trimmed down to "".
 *
 * @param {string} sIn - the text to check
 * @param {number} maxSubL - exclusive upper bound for the repeating substring length
 * @param {number} maxMatchLenThreshold - minimum length of the repeated run to trim
 * @returns {{trimmed: boolean, data: string}} data is sIn itself when nothing was trimmed
 */
export function trim_repeat_garbage_at_end(sIn, maxSubL=10, maxMatchLenThreshold=40) {
    /** @type {number[]} */
    let rCnt = [0]; // rCnt[subL] = number of back-to-back repeats found for that length
    let maxMatchLen = maxSubL; // a run needs to be longer than this to be considered at all
    let iMML = -1; // substring length which gave maxMatchLen, -1 if none qualified
    for (let subL = 1; subL < maxSubL; subL++) {
        rCnt.push(0);
        if (subL > sIn.length) {
            // A substring longer than the text itself cannot repeat within it.
            continue;
        }
        const refS = sIn.substring(sIn.length - subL);
        // Walk backwards in subL sized steps for as long as the chunk matches the tail.
        let i = sIn.length;
        while ((i >= subL) && (sIn.substring(i - subL, i) === refS)) {
            rCnt[subL] += 1;
            i -= subL;
        }
        // Record the run irrespective of whether the walk stopped at a mismatch or
        // at the beginning of the string.
        const curMatchLen = rCnt[subL] * subL;
        if (maxMatchLen < curMatchLen) {
            maxMatchLen = curMatchLen;
            iMML = subL;
        }
    }
    console.debug("DBUG:DU:TrimRepeatGarbage:", rCnt);
    if ((iMML === -1) || (maxMatchLen < maxMatchLenThreshold)) {
        return { trimmed: false, data: sIn };
    }
    console.debug("DBUG:TrimRepeatGarbage:TrimmedCharLen:", maxMatchLen);
    const iEnd = sIn.length - maxMatchLen;
    return { trimmed: true, data: sIn.substring(0, iEnd) };
}


/**
 * Simple minded logic to help remove repeating garbage at the end of the string, till it cant.
 * If it is not able to trim, then it will try to skip a char at the end and then trim, a few
 * times. This ensures that even if there are multiple runs of garbage with different patterns,
 * the logic still tries to munch through them.
 *
 * If a trim succeeds after skipping some chars, those skipped chars are dropped along with
 * the garbage. If no trim succeeds within skipMax consecutive attempts, the text as it was
 * before the skipping started is returned.
 *
 * @param {string} sIn - the text to check
 * @param {number | undefined} [maxSubL] - passed on to trim_repeat_garbage_at_end
 * @param {number | undefined} [maxMatchLenThreshold] - passed on to trim_repeat_garbage_at_end
 * @param {number} [skipMax] - max number of consecutive failed attempts before giving up
 * @returns {string} the text with the trailing repeating garbage removed
 */
export function trim_repeat_garbage_at_end_loop(sIn, maxSubL, maxMatchLenThreshold, skipMax=16) {
    let sCur = sIn;
    let sSaved = ""; // text as it was when the current sequence of failed attempts started
    let iTry = 0;
    while (true) {
        const got = trim_repeat_garbage_at_end(sCur, maxSubL, maxMatchLenThreshold);
        if (got.trimmed) {
            // Made progress, so restart the skip budget from the trimmed text.
            iTry = 0;
            sCur = got.data;
            continue;
        }
        if (iTry === 0) {
            sSaved = got.data;
        }
        iTry += 1;
        if (iTry >= skipMax) {
            return sSaved;
        }
        // Skip a char at the end and retry.
        sCur = got.data.substring(0, got.data.length - 1);
    }
}


/**
 * A simple minded try trim garbage at end using histogram driven characteristics.
 * There can be variation in the repetitions, as long as no new char props up.
 *
 * This tracks the chars and their frequency in a specified length of substring at the end
 * and in turn checks if moving further into the generated text from the end remains within
 * the same char subset or goes beyond it and based on that either trims the string at the
 * end or not. This allows to filter garbage at the end, including even if there are certain
 * kind of small variations in the repeated text wrt position of seen chars.
 *
 * Allow the garbage to contain upto maxUniq chars, but at the same time ensure that
 * a given type of char ie numerals or alphabets or other types dont cross the specified
 * maxType limit. This allows intermixed text garbage to be identified and trimmed.
 *
 * ALERT: This is not perfect and only provides a rough garbage identification logic.
 * Also it currently only differentiates between character classes wrt english.
 *
 * @param {string} sIn - the text to check
 * @param {number} maxType - max unique chars allowed per char class (numeral/alphabet/other)
 * @param {number} maxUniq - learning stops once this many unique chars have been seen
 * @param {number} maxMatchLenThreshold - number of trailing chars to learn from, and also
 *   the minimum length of the trailing garbage run for it to be trimmed
 * @returns {{trimmed: boolean, data: string}} data is sIn itself when nothing was trimmed
 */
export function trim_hist_garbage_at_end(sIn, maxType, maxUniq, maxMatchLenThreshold) {
    if ((sIn.length === 0) || (sIn.length < maxMatchLenThreshold)) {
        return { trimmed: false, data: sIn };
    }
    let iAlp = 0;
    let iNum = 0;
    let iOth = 0;
    // Learn
    /** @type {Record<string, number>} */
    let hist = {};
    let iUniq = 0;
    for (let i = 0; i < maxMatchLenThreshold; i++) {
        const c = sIn[sIn.length - 1 - i];
        if (c in hist) {
            hist[c] += 1;
            continue;
        }
        if (/[0-9]/.test(c)) {
            iNum += 1;
        } else if (/[A-Za-z]/.test(c)) {
            iAlp += 1;
        } else {
            iOth += 1;
        }
        iUniq += 1;
        if (iUniq >= maxUniq) {
            // Too many unique chars; the char which hit the limit is not learnt, so the
            // catch phase below will stop at it.
            break;
        }
        hist[c] = 1;
    }
    console.debug("DBUG:TrimHistGarbage:", hist);
    if ((iAlp > maxType) || (iNum > maxType) || (iOth > maxType)) {
        return { trimmed: false, data: sIn };
    }
    // Catch and Trim
    for (let i = 0; i < sIn.length; i++) {
        const c = sIn[sIn.length - 1 - i];
        if (c in hist) {
            continue;
        }
        // The last i chars were all within the learnt set; this char is the 1st one outside it.
        if ((i === 0) || (i < maxMatchLenThreshold)) {
            // Nothing to trim, or the run is too short to be treated as garbage.
            return { trimmed: false, data: sIn };
        }
        console.debug("DBUG:TrimHistGarbage:TrimmedCharLen:", i);
        // Drop exactly the i trailing garbage chars, retaining the char which broke the run.
        return { trimmed: true, data: sIn.substring(0, sIn.length - i) };
    }
    console.debug("DBUG:TrimHistGarbage:Trimmed fully");
    return { trimmed: true, data: "" };
}

/**
 * Keep trimming repeatedly using hist_garbage logic, till you no longer can.
 * This ensures that even if there are multiple runs of garbage with different patterns,
 * the logic still tries to munch through them.
 *
 * @param {string} sIn - the text to check
 * @param {number} maxType - passed on to trim_hist_garbage_at_end
 * @param {number} maxUniq - passed on to trim_hist_garbage_at_end
 * @param {number} maxMatchLenThreshold - passed on to trim_hist_garbage_at_end
 * @returns {string} the text with the trailing garbage removed
 */
export function trim_hist_garbage_at_end_loop(sIn, maxType, maxUniq, maxMatchLenThreshold) {
    let sCur = sIn;
    while (true) {
        const got = trim_hist_garbage_at_end(sCur, maxType, maxUniq, maxMatchLenThreshold);
        // Stop when nothing was trimmed; the length check guards against looping forever
        // should a trim ever be reported without the text actually getting shorter.
        if (!got.trimmed || (got.data.length >= sCur.length)) {
            return got.data;
        }
        sCur = got.data;
    }
}

/**
 * Try trim garbage at the end by using both the hist-driven-garbage-trimming as well as
 * skip-a-bit-if-reqd-then-repeat-pattern-based-garbage-trimming, with blind retrying
 * (the pair of trimmers is run twice, back to back).
 *
 * @param {string} sIn - the text to check
 * @returns {string} the text with the trailing garbage removed
 */
export function trim_garbage_at_end(sIn) {
    let sCur = sIn;
    for (let i = 0; i < 2; i++) {
        sCur = trim_hist_garbage_at_end_loop(sCur, 8, 24, 72);
        sCur = trim_repeat_garbage_at_end_loop(sCur, 32, 72, 12);
    }
    return sCur;
}


/**
 * NewLines array helper.
 * Allow for maintaining a list of lines.
 * Allow for a line to be builtup/appended part by part.
 */
export class NewLines {

    constructor() {
        /** @type {string[]} */
        this.lines = [];
    }

    /**
     * Extracts lines from the passed string and in turn either
     * append to a previous partial line or add a new line.
     *
     * Each complete line is stored along with its terminating newline. Text after the
     * last newline (if any) is stored as a partial line, to be completed by later calls.
     * An empty string adds nothing.
     *
     * @param {string} sLines - the text chunk to add; may contain zero or more newlines
     */
    add_append(sLines) {
        const aLines = sLines.split("\n");
        const iLast = aLines.length - 1;
        for (let idx = 0; idx <= iLast; idx++) {
            let line = aLines[idx];
            if (idx < iLast) {
                // Add back the newline removed by split
                line += "\n";
            } else if (line.length === 0) {
                // The segment after the last newline is empty (sLines was empty or ended
                // with a newline), so there is no partial line to record.
                break;
            }
            // Append if required, ie if the previously stored line is still partial
            if (idx === 0) {
                const lastLine = this.lines[this.lines.length - 1];
                if ((lastLine !== undefined) && !lastLine.endsWith("\n")) {
                    this.lines[this.lines.length - 1] += line;
                    continue;
                }
            }
            // Add new line
            this.lines.push(line);
        }
    }

    /**
     * Shift the oldest/earliest/0th line in the array. [Old-New|Earliest-Latest]
     * Optionally control whether only full lines (ie those with newline at end) will be returned
     * or will a partial line without a newline at end (can only be the last line) be returned.
     *
     * @param {boolean} bFullWithNewLineOnly - if true, a partial line is left in place
     * @returns {string | undefined} the removed line, or undefined if there is none to return
     */
    shift(bFullWithNewLineOnly=true) {
        const line = this.lines[0];
        if (line === undefined) {
            return undefined;
        }
        if (bFullWithNewLineOnly && !line.endsWith("\n")) {
            return undefined;
        }
        return this.lines.shift();
    }

}