Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -108,38 +108,21 @@ export class ContextTokenization {
*/
readonly tokens: ContextToken[];

/**
* The portion of edits from the true input keystroke that is not part of the
* final entry in `tokens`. If `null`, all edits are considered part of the
* final token's contents.
*
* If the final token is new due to a newly-introduced word boundary traversed
* by the keystroke, this will generally be set to an empty transform that
* 'finalizes' the previous tail token.
*
* (Refer to #12494 for an example case.)
*/
readonly taillessTrueKeystroke: Transform;

constructor(priorToClone: ContextTokenization);
constructor(tokens: ContextToken[]);
constructor(tokens: ContextToken[], alignment: TransitionEdge, taillessTrueKeystroke: Transform);
constructor(tokens: ContextToken[], alignment: TransitionEdge);
constructor(
param1: ContextToken[] | ContextTokenization,
tokenizationPath?: TransitionEdge,
taillessTrueKeystroke?: Transform
param1: ContextToken[] | ContextTokenization
) {
if(!(param1 instanceof ContextTokenization)) {
const tokens = param1;
if(!tokens || tokens.length == 0) {
throw new Error("ContextTokenization requires at least one existing ContextToken");
}
this.tokens = [].concat(tokens);
this.taillessTrueKeystroke = taillessTrueKeystroke;
} else {
const priorToClone = param1;
this.tokens = priorToClone.tokens.map((entry) => new ContextToken(entry));
this.taillessTrueKeystroke = priorToClone.taillessTrueKeystroke;
}
}

Expand Down Expand Up @@ -391,7 +374,7 @@ export class ContextTokenization {
tokenization.push(token);
}

return new ContextTokenization(this.tokens.slice(0, sliceIndex).concat(tokenization), null, this.taillessTrueKeystroke);
return new ContextTokenization(this.tokens.slice(0, sliceIndex).concat(tokenization));
}

/**
Expand Down Expand Up @@ -486,11 +469,7 @@ export class ContextTokenization {
affectedToken = null;
}

return new ContextTokenization(
this.tokens.slice(0, sliceIndex).concat(tailTokenization),
null,
determineTaillessTrueKeystroke(transitionEdge)
);
return new ContextTokenization(this.tokens.slice(0, sliceIndex).concat(tailTokenization));
}
}

Expand Down Expand Up @@ -1175,59 +1154,4 @@ export function assembleTransforms(stackedInserts: string[], stackedDeletes: num
}

return transformMap;
}

/**
 * Builds the part of the incoming transform that does not land in the final
 * token of the resulting context. Any suggestion that gets applied should
 * preserve this component.
 *
 * Only the first entry of `tokenizationAnalysis.inputs` is consulted. The
 * entries of its `sample` map are visited in iteration order, skipping the
 * last one, and their `insert` / `deleteLeft` values are accumulated.
 *
 * @param tokenizationAnalysis The transition edge whose primary (index 0)
 * tokenized input is to be analyzed.
 * @returns
 * - When the map has a key of `1`: a transform holding only the accumulated
 *   `insert` and `deleteLeft` (empty if nothing precedes the last entry).
 * - Otherwise, when more than one entry exists: a shallow copy of the first
 *   entry with the later non-final entries accumulated onto it.
 * - Otherwise: `undefined`.
 */
export function determineTaillessTrueKeystroke(tokenizationAnalysis: TransitionEdge) {
  const primarySample = tokenizationAnalysis.inputs[0].sample;

  // Remains undefined until we know whether anything precedes the final token.
  let preserved: Transform;

  // A key of 1 presumably signals that the input introduced a new token
  // (TODO confirm against the tokenizer's key scheme). In that case, start
  // from an empty transform so that a result is emitted even when there is
  // nothing else to accumulate; this keeps suggestions from replacing the
  // "current" token.
  if(primarySample.has(1)) {
    preserved = { insert: '', deleteLeft: 0 };
  }

  // Every key except the last; the last entry belongs to the final token.
  const leadingKeys = [...primarySample.keys()].slice(0, -1);

  /*
   * Design note for future multitokenization support:
   *
   * If the input examined here is not on the "true" tokenization, new
   * suggestions would be based on a state that does not currently exist; that
   * state would need to be enforced first, and only then could the suggestion
   * be applied. Auto-applying such a suggestion would mean the new "true"
   * tokenization no longer results directly from the true input.
   *
   * Giving tokens unique IDs on first creation would allow backtracing to the
   * most recent common ancestor; simple cases (same 'token', but different
   * input transform lengths/effects) would share the same prior token ID.
   */
  leadingKeys.forEach((key) => {
    const component = primarySample.get(key);

    if(preserved) {
      preserved.insert += component.insert;
      preserved.deleteLeft += component.deleteLeft;
    } else {
      // First contribution without a seeded default: copy the entry so that
      // later accumulation never mutates the source transform.
      preserved = {...component};
    }
  });

  return preserved;
}
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ export function transitionTokenizations(

// Following call: is actually designed to build SubstitutionQuotientSpurs.
const transitionedTokenization = rootTokenization.evaluateTransition(precomp[1], trueInput.id, bestProb);
const remadeTokenization = new ContextTokenization(transitionedTokenization.tokens, subset.transitionEdges.get(rootTokenization), transitionedTokenization.taillessTrueKeystroke);
const remadeTokenization = new ContextTokenization(transitionedTokenization.tokens, subset.transitionEdges.get(rootTokenization));

// If the last token is empty and has no flag for a revertable transition,
// attempt to copy the previous token's revertable transition flag.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { searchForProperty, WordBreakProperty } from '@keymanapp/models-wordbrea
import { TransformUtils } from './transformUtils.js';
import { detectCurrentCasing, determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js';
import { ContextTokenLike } from './correction/context-token.js';
import { ContextTokenization, mapWhitespacedTokenization } from './correction/context-tokenization.js';
import { ContextTokenization } from './correction/context-tokenization.js';
import { ContextTracker } from './correction/context-tracker.js';
import { ContextState, determineContextSlideTransform } from './correction/context-state.js';
import { ContextTransition } from './correction/context-transition.js';
Expand Down Expand Up @@ -181,12 +181,6 @@ export interface PredictionMetadata {
* available upon initial construction of this type.
*/
matchLevel?: SuggestionSimilarity;

/**
* Text from the triggering input that should _not_ be affected by the
* prediction.
*/
preservationTransform?: Transform;
}

export interface IntermediateTokenizedPrediction {
Expand Down Expand Up @@ -312,21 +306,9 @@ export function determineTraversallessCorrectionSequences(
suggestionParams.tokens.forEach((token) => token.correction.sample.id = transformId);
}

const tokenizationMapping = mapWhitespacedTokenization(tokenization.left.map((t) => { return {exampleInput: t.text, codepointLength: KMWString.length(t.text)} }), lexicalModel, correction.sample);
const tokenizedCorrection = tokenizationMapping.tokenizedTransform;
const tokenizedCorrectionEntries = [...tokenizedCorrection.values()];

// IF: array has multiple entries, then build the preservation-transform as below, including the deleteLeft.
// If not, don't make one!
const preservationTransform = tokenizedCorrectionEntries.slice(0, -1).reduce((accum, curr) => {
return { insert: accum.insert + curr.insert, deleteLeft: accum.deleteLeft + curr.deleteLeft };
}, { insert: '', deleteLeft: 0, id: correction.sample.id});

returnedPredictionData.push({
...suggestionParams,
applyInPost: (p) => {
p.metadata.preservationTransform = preservationTransform;
}
applyInPost: (p) => {}
})
}

Expand Down Expand Up @@ -616,16 +598,9 @@ export function determineTokenizedCorrectionSequence(
suggestionParams.tokens.forEach((t) => t.correction.sample.id = transition.transitionId);
}

const { deleteLeft } = transitionParams;

return {
...suggestionParams,
applyInPost: (entry: IntermediateTokenizedPrediction) => {
entry.metadata.preservationTransform = tokenization.taillessTrueKeystroke;
// // Will need an extra lookup layer if the suggestion is generated from within a cluster.
// entry.baseTokenization = transition.final.tokenizationSourceMap.get(tokenization);
entry.components[0].prediction.transform.deleteLeft = deleteLeft;
}
applyInPost: (entry) => {}
};
}

Expand Down Expand Up @@ -1328,24 +1303,6 @@ export function finalizeSuggestions(
const suggestions = deduplicatedSuggestionTuples.map((tuple) => {
const prediction = tuple.components.prediction;

// If this is a suggestion after any form of wordbreak input, make sure we preserve any components
// from prior tokens!
//
// Note: may need adjustment if/when supporting phrase-level correction.
if(tuple.metadata.preservationTransform) {
const mergedTransform = {
...models.buildMergedTransform(tuple.metadata.preservationTransform, {...prediction.transform, deleteLeft: 0}),
deleteLeft: prediction.transform.deleteLeft
};

// Temporarily and locally drops 'readonly' semantics so that we can reassign the transform.
// See https://www.typescriptlang.org/docs/handbook/release-notes/typescript-2-8.html#improved-control-over-mapped-type-modifiers
let mutableSuggestion = prediction as {-readonly [transform in keyof Suggestion]: Suggestion[transform]};

// Assignment via by-reference behavior, as suggestion is an object
mutableSuggestion.transform = mergedTransform;
}

// Is sometimes not set during unit tests.
if(prediction.transformId !== undefined) {
prediction.transform.id = prediction.transformId;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -247,9 +247,6 @@ describe('ContextState', () => {
let newContextMatch = baseState.analyzeTransition(existingContext, toWrapperDistribution(transform));
assert.isNotNull(newContextMatch?.final);
assert.deepEqual(newContextMatch.final.displayTokenization.tokens.map(token => token.exampleInput), rawTokens);
// We want to preserve the added whitespace when predicting a token that follows after it.

assert.deepEqual(newContextMatch.final.displayTokenization.taillessTrueKeystroke, { insert: ' ', deleteLeft: 0 });

// The 'wordbreak' transform
let state = newContextMatch?.final;
Expand All @@ -275,8 +272,6 @@ describe('ContextState', () => {
let newContextMatch = baseState.analyzeTransition(existingContext, toWrapperDistribution(transform));
assert.isNotNull(newContextMatch?.final);
assert.deepEqual(newContextMatch.final.displayTokenization.tokens.map(token => token.exampleInput), rawTokens);
// We want to preserve the added whitespace when predicting a token that follows after it.
assert.deepEqual(newContextMatch.final.displayTokenization.taillessTrueKeystroke, { insert: ' ', deleteLeft: 0 });

// The 'wordbreak' transform
let state = newContextMatch?.final;
Expand Down Expand Up @@ -319,7 +314,6 @@ describe('ContextState', () => {
let newContextMatch = baseState.analyzeTransition(existingContext, toWrapperDistribution(transform));
assert.isNotNull(newContextMatch?.final);
assert.deepEqual(newContextMatch.final.displayTokenization.tokens.map(token => token.exampleInput), rawTokens);
assert.deepEqual(newContextMatch.final.displayTokenization.taillessTrueKeystroke, { insert: '', deleteLeft: 0 });

// The 'wordbreak' transform
let state = newContextMatch.final;
Expand All @@ -345,8 +339,6 @@ describe('ContextState', () => {
let newContextMatch = baseState.analyzeTransition(newContext, toWrapperDistribution(transform));
assert.isNotNull(newContextMatch?.final);
assert.deepEqual(newContextMatch.final.displayTokenization.tokens.map(token => token.exampleInput), rawTokens);
// We want to preserve the added whitespace when predicting a token that follows after it.
assert.deepEqual(newContextMatch.final.displayTokenization.taillessTrueKeystroke, { insert: ' ', deleteLeft: 0 });

// The 'wordbreak' transform
let state = newContextMatch.final;
Expand Down Expand Up @@ -376,8 +368,6 @@ describe('ContextState', () => {
let newContextMatch = baseState.analyzeTransition(existingContext, [{sample: transform, p: 1}]);
assert.isNotNull(newContextMatch?.final);
assert.deepEqual(newContextMatch.final.displayTokenization.tokens.map(token => token.exampleInput), rawTokens);
// We want to preserve all text preceding the new token when applying a suggestion.
assert.deepEqual(newContextMatch.final.displayTokenization.taillessTrueKeystroke, { insert: 'd ', deleteLeft: 0});

// The 'wordbreak' transform
let state = newContextMatch.final;
Expand All @@ -401,8 +391,6 @@ describe('ContextState', () => {
let newContextMatch = baseState.analyzeTransition(existingContext, [{sample: transform, p: 1}]);
assert.isNotNull(newContextMatch?.final);
assert.deepEqual(newContextMatch.final.displayTokenization.tokens.map(token => token.exampleInput), rawTokens);
// We want to preserve all text preceding the new token when applying a suggestion.
assert.deepEqual(newContextMatch.final.displayTokenization.taillessTrueKeystroke, { insert: 'tor ', deleteLeft: 0 });

// The 'wordbreak' transform
let state = newContextMatch.final;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ describe('ContextTokenization', function() {
const rawTextTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day'];
const tokens = rawTextTokens.map((text => toTransformToken(text)));

let tokenization = new ContextTokenization(tokens, null, null /* dummy val */);
let tokenization = new ContextTokenization(tokens);

assert.deepEqual(tokenization.tokens.map((entry) => entry.exampleInput), rawTextTokens);
assert.deepEqual(tokenization.tokens.map((entry) => entry.isWhitespace), rawTextTokens.map((entry) => entry == ' '));
Expand All @@ -118,7 +118,7 @@ describe('ContextTokenization', function() {
it('clones', () => {
const rawTextTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day'];
const tokens = rawTextTokens.map((text => toTransformToken(text)));
let baseTokenization = new ContextTokenization(tokens, null, null /* dummy val */);
let baseTokenization = new ContextTokenization(tokens);
let cloned = new ContextTokenization(baseTokenization);

assert.sameOrderedMembers(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,11 +175,7 @@ function generateFixtureForTokenizationOutboundTransition (

// CURRENTLY NOT DONE: adding new or replacement tokens for text to be placed after 'quotientNodeToExtend'.

const transitionedTokenization = new ContextTokenization(
srcTokenization.tokens.slice(0, srcTokenization.tokens.length - 1 + relativeTailIndex).concat(token),
tokenizationEdge,
null
);
const transitionedTokenization = new ContextTokenization(srcTokenization.tokens.slice(0, srcTokenization.tokens.length - 1 + relativeTailIndex).concat(token));

return {
/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ describe('TokenizationCorrector', () => {
p: 1
}
const therefxyz = new ContextToken(new SubstitutionQuotientSpur(therefxy, [zInput], zInput));
const therefxyzTokenization = new ContextTokenization([therefxyz], null, null);
const therefxyzTokenization = new ContextTokenization([therefxyz]);

const instance = new TokenizationCorrector(
therefxyzTokenization,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,6 @@ describe('determineContextTransition', () => {
assert.equal(transition.final.context.left, targetContext.left);
assert.equal(transition.final.context.right ?? "", targetContext.right ?? "");
assert.sameDeepOrderedMembers(transition.inputDistribution, inputDistribution);
assert.isNotOk(transition.final.displayTokenization.taillessTrueKeystroke);
assert.equal(transition.transitionId, 1);
} finally {
warningEmitterSpy.restore();
Expand Down
Loading
Loading