diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts index 0c492c76255..e21b9709b0b 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts @@ -35,11 +35,42 @@ function textToCharTransforms(text: string, transformId?: number): Transform[] { [...text].map(insert => ({insert, deleteLeft: 0})); } + +/** + * Defines an interface compatible with ContextToken that is useful for handling + * cases that should not be considered correctable. + */ +export interface ContextTokenLike { + /** + * Generates text corresponding to the net effects of the most likely inputs + * received that can correspond to the represented token. + */ + exampleInput: string; + + /** + * Reports the length in codepoints of corrected text represented by the + * current token. + */ + codepointLength: number; + + /** + * Whether or not the token is likely still being edited by the user (due to + * adjacency of the caret) + */ + isPartial?: boolean; + + /** + * Gets a compact string-based representation of `inputRange` that + * maps compatible token source ranges to each other. + */ + sourceRangeKey?: string; +} + /** * Represents cached data about one token (either a word or a unit of whitespace) * in the context and associated correction-search progress and results. */ -export class ContextToken { +export class ContextToken implements ContextTokenLike { /** * Indicates whether or not the token is considered whitespace. 
*/ @@ -54,6 +85,10 @@ export class ContextToken { } private _searchModule: SearchQuotientNode; + /** + * Whether or not the token is likely still being edited by the user (due to + * adjacency of the caret) + */ isPartial: boolean; /** @@ -118,6 +153,14 @@ export class ContextToken { return new ContextToken(searchModule, isPartial); } + /** + * Reports the length in codepoints of corrected text represented by the + * current token. + */ + get codepointLength() { + return this._searchModule.codepointLength; + } + get inputCount() { return this._searchModule.inputCount; } @@ -155,7 +198,7 @@ export class ContextToken { /** * Generates text corresponding to the net effects of the most likely inputs - * received that can correspond to the current instance. + * received that can correspond to the represented token. */ get exampleInput(): string { return this.searchModule.bestExample.text; diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index 45648ab1a2e..b9d8919b9a3 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -5,9 +5,9 @@ import { searchForProperty, WordBreakProperty } from '@keymanapp/models-wordbrea import { TransformUtils } from './transformUtils.js'; import { determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js'; +import { ContextTokenLike } from './correction/context-token.js'; import { ContextTokenization } from './correction/context-tokenization.js'; import { ContextTracker } from './correction/context-tracker.js'; -import { ContextToken } from './correction/context-token.js'; import { ContextState, determineContextSlideTransform } from './correction/context-state.js'; import { ContextTransition } from './correction/context-transition.js'; import { ExecutionTimer } from 
'./correction/execution-timer.js'; @@ -73,6 +73,43 @@ export const CORRECTION_SEARCH_THRESHOLDS = { REPLACEMENT_SEARCH_THRESHOLD: 4 as const // e^-4 = 0.0183156388. Allows "80%" of an extra edit. } +/** + * Represents the minimum replacement range and effects required for + * suggestions. + * + * These values are based on properties of the transition from their base + * context-tokenization to their target-tokenization (and its represented + * context variant). + */ +export interface SuggestionReplacement<T extends ContextTokenLike> { + /** + * Tokens lost from the base context-tokenization in the target + * context-tokenization due to the transition event. + * + * These are implicitly replaced when applying Suggestions. + */ + tokensToRemove: T[], + + /** + * Tokens added (after the removed tokens) to the base context-tokenization to + * produce the target context-tokenization. + * + * As these are "new" tokens generated by the transition, Suggestions should represent + * corrections and predictions rooted upon these tokens. + */ + tokensToPredict: T[], + + /** + * Indicates the total range of left-deletion needed when applying suggestions. + */ + deleteLeft: number, + + /** + * Indicates the id of the underlying context transition. + */ + transitionId?: number +} + /** * Collates information related to suggestions during the suggestion generation * process. @@ -395,53 +432,67 @@ export function determineSuggestionAlignment( * @param variantForSuggestions * @returns */ -export function determineSuggestionRange( - userContextTokenization: ContextTokenization, - variantForSuggestions: ContextTokenization -): { tokensToRemove: ContextToken[], tokensToPredict: ContextToken[] } { - // Assumption: spaceIds monotonically increase as new ones are generated. - // Given this, we backtrace on the token tails until finding a spot where the - // spaceIds match, dropping any that are newer than the last found in the - // other. 
- // - // We full-replace all tokens affected by an applied suggestion, so if there's - // a mismatch between the final form of a token, that implies that suggestions - // would replace the original form of the token anyway. - const tokenSetA = userContextTokenization.tokens.slice(); - const tokenSetB = variantForSuggestions.tokens.slice(); - - const tokensToRemove: ContextToken[] = []; - const tokensToPredict: ContextToken[] = []; - - const tailIdFor = (tokens: ContextToken[]) => tokens[tokens.length-1]?.spaceId ?? -1; - let tailOfA = tailIdFor(tokenSetA); - let tailOfB = tailIdFor(tokenSetB); - while(tailOfA != tailOfB) { - if(tailOfA < tailOfB) { - tokensToPredict.push(tokenSetB.pop()); - tailOfB = tailIdFor(tokenSetB); - } else { - tokensToRemove.push(tokenSetA.pop()); - tailOfA = tailIdFor(tokenSetA); +export function determineSuggestionRange<T extends ContextTokenLike>( + userContextTokenization: T[], + variantForSuggestions: T[], + equalityChecker: (a: T, b: T) => boolean +): SuggestionReplacement<T> { + // Add null/undefined guards to the equality checker. + const temp = equalityChecker; + equalityChecker = (a, b) => { + if(!a || !b) { + return false; + } + + return temp(a, b); + } + + const deleteLeftCalc = (tokenSet: T[], predictCount: number) => { + // TODO: once we start activating multi-tokenization for real, only the + // 'reduce' component should remain. + return (predictCount > 1) + ? (tokenSet[tokenSet.length - 1]?.codepointLength ?? 0) + : tokenSet.reduce((prev, curr) => prev + curr.codepointLength, 0); + } + + const tokenSetA = userContextTokenization.slice(); + const tokenSetB = variantForSuggestions.slice(); + + let aHeadIndexInB = tokenSetB.findIndex((t) => equalityChecker(t, tokenSetA[0])); + let bHeadIndexInA = tokenSetA.findIndex((t) => equalityChecker(t, tokenSetB[0])); + + if(aHeadIndexInB == -1 && bHeadIndexInA == -1) { + // Both are full replacements. 
+ return { + tokensToRemove: tokenSetA, + tokensToPredict: tokenSetB, + deleteLeft: deleteLeftCalc(tokenSetA, tokenSetB.length) } + } else if(aHeadIndexInB != 0 && bHeadIndexInA != 0) { + throw new Error("Leading edge of context should not differ in both tokenizations."); + } + + let tailOffset = 0; + while(equalityChecker(tokenSetA[bHeadIndexInA + tailOffset], tokenSetB[aHeadIndexInB + tailOffset])) { + tailOffset++; } - tokensToPredict.reverse(); + const tokensToRemove: T[] = tokenSetA.slice(bHeadIndexInA + tailOffset); + const tokensToPredict: T[] = tokenSetB.slice(aHeadIndexInB + tailOffset); // Can occur when backspacing to the end of a previous word. if(tokensToPredict.length == 0) { if(tokenSetA.length == 0 || tokenSetB.length == 0) { throw new Error("Invalid state - a tokenization is missing expected tokens"); } - tokensToRemove.push(tokenSetA.pop()); - tokensToPredict.push(tokenSetB.pop()); + tokensToRemove.unshift(tokenSetA[bHeadIndexInA + tailOffset - 1]); + tokensToPredict.unshift(tokenSetB[aHeadIndexInB + tailOffset - 1]); } - tokensToRemove.reverse(); - return { tokensToRemove, - tokensToPredict + tokensToPredict, + deleteLeft: deleteLeftCalc(tokensToRemove, tokensToPredict.length) } } diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-range.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-range.tests.ts index 3445abaadd9..a6b5bcfaeb7 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-range.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-range.tests.ts @@ -107,7 +107,10 @@ function buildQuickBrownFixture() { null ); + const deleteLeftCalc = (tokens: ContextToken[]) => tokens.reduce((accum, curr) => accum + curr.codepointLength, 0); + return { + deleteLeftCalc, baseTokenization, 
variations: { noChange: { @@ -115,7 +118,8 @@ function buildQuickBrownFixture() { tokenization: baseTokenization, range: { tokensToRemove: [baseTokenization.tail], - tokensToPredict: [baseTokenization.tail] + tokensToPredict: [baseTokenization.tail], + deleteLeft: deleteLeftCalc([baseTokenization.tail]) } }, plainInsert: { @@ -123,7 +127,8 @@ function buildQuickBrownFixture() { tokenization: plainInsertTokenization, range: { tokensToRemove: [baseTokenization.tail], - tokensToPredict: [plainInsertTokenization.tail] + tokensToPredict: [plainInsertTokenization.tail], + deleteLeft: deleteLeftCalc([baseTokenization.tail]) } }, newTokenInsert: { @@ -131,7 +136,8 @@ function buildQuickBrownFixture() { tokenization: newTokenInsertTokenization, range: { tokensToRemove: [] as ContextToken[], - tokensToPredict: [newTokenInsertTokenization.tail] + tokensToPredict: [newTokenInsertTokenization.tail], + deleteLeft: deleteLeftCalc([]) } }, charReplace: { @@ -139,7 +145,8 @@ function buildQuickBrownFixture() { tokenization: charReplaceTokenization, range: { tokensToRemove: [baseTokenization.tail], - tokensToPredict: [charReplaceTokenization.tail] + tokensToPredict: [charReplaceTokenization.tail], + deleteLeft: deleteLeftCalc([baseTokenization.tail]) } }, eraseToken: { @@ -147,7 +154,8 @@ function buildQuickBrownFixture() { tokenization: eraseTokenTokenization, range: { tokensToRemove: [baseTokenization.tail], - tokensToPredict: [eraseTokenTokenization.tail] + tokensToPredict: [eraseTokenTokenization.tail], + deleteLeft: deleteLeftCalc([baseTokenization.tail]) } }, del5Insert5: { @@ -155,7 +163,8 @@ function buildQuickBrownFixture() { tokenization: del5Insert5Tokenization, range: { tokensToRemove: baseTokenization.tokens.slice(baseTokenCount-3), - tokensToPredict: [del5Insert5Tokenization.tail] + tokensToPredict: [del5Insert5Tokenization.tail], + deleteLeft: deleteLeftCalc(baseTokenization.tokens.slice(baseTokenCount-3)) } }, deleteToBound: { @@ -163,86 +172,96 @@ function 
buildQuickBrownFixture() { tokenization: deleteToBoundTokenization, range: { tokensToRemove: baseTokenization.tokens.slice(baseTokenCount-3), - tokensToPredict: [deleteToBoundTokenization.tail] + tokensToPredict: [deleteToBoundTokenization.tail], + deleteLeft: deleteLeftCalc(baseTokenization.tokens.slice(baseTokenCount-3)) } } } }; } +const tokenEquality = (a: ContextToken, b: ContextToken) => a.spaceId == b.spaceId; + describe('determineSuggestionRange', () => { it('adjusts the final token if no tokenization changes occur', () => { const fixture = buildQuickBrownFixture(); const noChange = fixture.variations.noChange; - const analysis = determineSuggestionRange(fixture.baseTokenization, noChange.tokenization); + const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, noChange.tokenization.tokens, tokenEquality); assert.sameOrderedMembers(analysis.tokensToRemove, noChange.range.tokensToRemove); assert.sameOrderedMembers(analysis.tokensToPredict, noChange.range.tokensToPredict); + assert.equal(analysis.deleteLeft, noChange.range.deleteLeft); }); it('adjusts the final token after a simple same-token insert', () => { const fixture = buildQuickBrownFixture(); const plainInsert = fixture.variations.plainInsert; - const analysis = determineSuggestionRange(fixture.baseTokenization, plainInsert.tokenization); + const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, plainInsert.tokenization.tokens, tokenEquality); assert.sameOrderedMembers(analysis.tokensToRemove, plainInsert.range.tokensToRemove); assert.sameOrderedMembers(analysis.tokensToPredict, plainInsert.range.tokensToPredict); + assert.equal(analysis.deleteLeft, plainInsert.range.deleteLeft); }); it('adjusts the final token after a simple word-breaking insert', () => { const fixture = buildQuickBrownFixture(); const newTokenInsert = fixture.variations.newTokenInsert; - const analysis = determineSuggestionRange(fixture.baseTokenization, newTokenInsert.tokenization); + const 
analysis = determineSuggestionRange(fixture.baseTokenization.tokens, newTokenInsert.tokenization.tokens, tokenEquality); assert.sameOrderedMembers(analysis.tokensToRemove, newTokenInsert.range.tokensToRemove); assert.sameOrderedMembers(analysis.tokensToPredict, newTokenInsert.range.tokensToPredict); + assert.equal(analysis.deleteLeft, newTokenInsert.range.deleteLeft); }); it('adjusts the final token after a simple same-token character replacement', () => { const fixture = buildQuickBrownFixture(); const charReplace = fixture.variations.charReplace; - const analysis = determineSuggestionRange(fixture.baseTokenization, charReplace.tokenization); + const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, charReplace.tokenization.tokens, tokenEquality); assert.sameOrderedMembers(analysis.tokensToRemove, charReplace.range.tokensToRemove); assert.sameOrderedMembers(analysis.tokensToPredict, charReplace.range.tokensToPredict); + assert.equal(analysis.deleteLeft, charReplace.range.deleteLeft); }); it('handles deletion of two tokens + alteration of the token before', () => { const fixture = buildQuickBrownFixture(); const del5Insert5 = fixture.variations.del5Insert5; - const analysis = determineSuggestionRange(fixture.baseTokenization, del5Insert5.tokenization); + const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, del5Insert5.tokenization.tokens, tokenEquality); assert.sameOrderedMembers(analysis.tokensToRemove, del5Insert5.range.tokensToRemove); assert.sameOrderedMembers(analysis.tokensToPredict, del5Insert5.range.tokensToPredict); + assert.equal(analysis.deleteLeft, del5Insert5.range.deleteLeft); }); it('handles deletion of chars up to closest whitespace', () => { const fixture = buildQuickBrownFixture(); const eraseToken = fixture.variations.eraseToken; - const analysis = determineSuggestionRange(fixture.baseTokenization, eraseToken.tokenization); + const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, 
eraseToken.tokenization.tokens, tokenEquality); assert.sameOrderedMembers(analysis.tokensToRemove, eraseToken.range.tokensToRemove); assert.sameOrderedMembers(analysis.tokensToPredict, eraseToken.range.tokensToPredict); + assert.equal(analysis.deleteLeft, eraseToken.range.deleteLeft); }); it('handles deletion up to boundary of ancestor non-whitespace token', () => { const fixture = buildQuickBrownFixture(); const deleteToBound = fixture.variations.deleteToBound; - const analysis = determineSuggestionRange(fixture.baseTokenization, deleteToBound.tokenization); + const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, deleteToBound.tokenization.tokens, tokenEquality); assert.sameOrderedMembers(analysis.tokensToRemove, deleteToBound.range.tokensToRemove); assert.sameOrderedMembers(analysis.tokensToPredict, deleteToBound.range.tokensToPredict); + assert.equal(analysis.deleteLeft, deleteToBound.range.deleteLeft); }); it('handles large variation in intermediate tokens', () => { - const originalQuickBrownTokenization = buildQuickBrownFixture().baseTokenization; + const { deleteLeftCalc, baseTokenization: originalQuickBrownTokenization } = buildQuickBrownFixture(); const rawText = ['beyond', ' ', 'the', ' ', 'hungry', ' ', 'green', ' ', 'alligator']; // the quick brown fox jumped | // Final whitespace is immediately before index 10. 
@@ -255,7 +274,7 @@ describe('determineSuggestionRange', () => { null ) - const analysis = determineSuggestionRange(originalQuickBrownTokenization, foxVsAlligatorTokenization); + const analysis = determineSuggestionRange(originalQuickBrownTokenization.tokens, foxVsAlligatorTokenization.tokens, tokenEquality); assert.sameOrderedMembers( analysis.tokensToRemove, @@ -265,10 +284,15 @@ describe('determineSuggestionRange', () => { analysis.tokensToPredict, tokensToAppend ); + + // TODO: Once we allow multiple tokens to contribute to deleteLeft, replace + // RHS with + // originalQuickBrownTokenization.tokens.slice(transitionSliceIndex). + assert.equal(analysis.deleteLeft, deleteLeftCalc([originalQuickBrownTokenization.tail])); }); it('handles insertion of many extra new tokens at once', () => { - const originalQuickBrownTokenization = buildQuickBrownFixture().baseTokenization; + const { deleteLeftCalc, baseTokenization: originalQuickBrownTokenization } = buildQuickBrownFixture(); const originalTokenCount = originalQuickBrownTokenization.tokens.length; const rawText = ['dogs', ' ', 'and', ' ', 'the', ' ', 'sleeping', ' ', 'cat']; const tokensToAppend = rawText.map((t) => ContextToken.fromRawText(plainModel, t, false)); @@ -279,7 +303,7 @@ describe('determineSuggestionRange', () => { null ) - const analysis = determineSuggestionRange(originalQuickBrownTokenization, dogsAndCatTokenization); + const analysis = determineSuggestionRange(originalQuickBrownTokenization.tokens, dogsAndCatTokenization.tokens, tokenEquality); assert.sameOrderedMembers( analysis.tokensToRemove, @@ -289,5 +313,7 @@ describe('determineSuggestionRange', () => { analysis.tokensToPredict, tokensToAppend ); + + assert.equal(analysis.deleteLeft, deleteLeftCalc([originalQuickBrownTokenization.tail])); }); }); \ No newline at end of file