Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -108,18 +108,6 @@ export class ContextTokenization {
*/
readonly tokens: ContextToken[];

/**
* Denotes whether or not the transition to this tokenization added or deleted
* any tokens.
*/
readonly transitionEdits?: {
addedNewTokens: boolean,
removedOldTokens: boolean,
// NOTE: slated for removal in an upcoming PR. Exists in this form to
// facilitate factorization of the changes into smaller bodies of work.
editedTokenCount: number
};

/**
* The portion of edits from the true input keystroke that are not part of the
* final entry in `token`. If `null`, all edits are considered part of the
Expand Down Expand Up @@ -147,18 +135,10 @@ export class ContextTokenization {
throw new Error("ContextTokenization requires at least one existing ContextToken");
}
this.tokens = [].concat(tokens);
if(tokenizationPath) {
this.transitionEdits = {
addedNewTokens: tokenizationPath?.inputs[0].sample.has(1) ?? false,
removedOldTokens: (tokenizationPath?.alignment.removedTokenCount ?? 0) > 0,
editedTokenCount: tokenizationPath?.inputs[0].sample.size
}
}
this.taillessTrueKeystroke = taillessTrueKeystroke;
} else {
const priorToClone = param1;
this.tokens = priorToClone.tokens.map((entry) => new ContextToken(entry));
this.transitionEdits = priorToClone.transitionEdits ? {...priorToClone.transitionEdits} : null;
this.taillessTrueKeystroke = priorToClone.taillessTrueKeystroke;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ describe('ContextState', () => {
assert.isNotNull(newContextMatch?.final);
assert.deepEqual(newContextMatch.final.displayTokenization.tokens.map(token => token.exampleInput), rawTokens);
// We want to preserve the added whitespace when predicting a token that follows after it.

assert.deepEqual(newContextMatch.final.displayTokenization.taillessTrueKeystroke, { insert: ' ', deleteLeft: 0 });

// The 'wordbreak' transform
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ import {
ExtendedEditOperation,
generateSubsetId,
models,
TransitionEdge,
SearchQuotientSpur,
traceInsertEdits,
LegacyQuotientSpur
Expand Down Expand Up @@ -100,72 +99,26 @@ describe('ContextTokenization', function() {
let tokenization = new ContextTokenization(rawTextTokens.map((text => toToken(text))));
assert.deepEqual(tokenization.tokens.map((entry) => entry.exampleInput), rawTextTokens);
assert.deepEqual(tokenization.tokens.map((entry) => entry.isWhitespace), rawTextTokens.map((entry) => entry == ' '));
assert.isNotOk(tokenization.transitionEdits);
assert.equal(tokenization.tail.exampleInput, 'day');
assert.isFalse(tokenization.tail.isWhitespace);
});

it("constructs from a token array + alignment data", () => {
const rawTextTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day'];
const tokens = rawTextTokens.map((text => toTransformToken(text)));
const emptyTransform = { insert: '', deleteLeft: 0, deleteRight: 0 };

// We _could_ flesh this out a bit more... but it's not really needed for this test.
const edgeWindow = buildEdgeWindow(tokens, emptyTransform, false, testEdgeWindowSpec);
let transitionEdits: TransitionEdge = {
alignment: {
merges: [],
splits: [],
unmappedEdits: [],
edgeWindow: {...edgeWindow, retokenization: rawTextTokens.slice(edgeWindow.sliceIndex)},
removedTokenCount: 0
},
inputs: [{sample: (() => {
const map = new Map<number, Transform>();
map.set(0, emptyTransform);
return map;
})(), p: 1}],
inputSubsetId: generateSubsetId()
};

let tokenization = new ContextTokenization(tokens, transitionEdits, null /* dummy val */);
let tokenization = new ContextTokenization(tokens, null, null /* dummy val */);

assert.deepEqual(tokenization.tokens.map((entry) => entry.exampleInput), rawTextTokens);
assert.deepEqual(tokenization.tokens.map((entry) => entry.isWhitespace), rawTextTokens.map((entry) => entry == ' '));
assert.isOk(tokenization.transitionEdits);
assert.deepEqual(tokenization.transitionEdits, {
addedNewTokens: false,
removedOldTokens: false,
editedTokenCount: 1
});
assert.equal(tokenization.tail.exampleInput, 'day');
assert.isFalse(tokenization.tail.isWhitespace);
});

it('clones', () => {
const rawTextTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day'];
const tokens = rawTextTokens.map((text => toTransformToken(text)));
const emptyTransform = { insert: '', deleteLeft: 0, deleteRight: 0 };

// We _could_ flesh this out a bit more... but it's not really needed for this test.
const edgeWindow = buildEdgeWindow(tokens, emptyTransform, false, testEdgeWindowSpec);
let transitionEdits: TransitionEdge = {
alignment: {
merges: [],
splits: [],
unmappedEdits: [],
edgeWindow: {...edgeWindow, retokenization: rawTextTokens.slice(edgeWindow.sliceIndex)},
removedTokenCount: 0
},
inputs: [{sample: (() => {
const map = new Map<number, Transform>();
map.set(0, emptyTransform);
return map;
})(), p: 1}],
inputSubsetId: generateSubsetId()
};

let baseTokenization = new ContextTokenization(tokens, transitionEdits, null /* dummy val */);
let baseTokenization = new ContextTokenization(tokens, null, null /* dummy val */);
let cloned = new ContextTokenization(baseTokenization);

assert.sameOrderedMembers(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,6 @@ function assertMatchingToken(actual: ContextToken, expected: ContextToken, msg:
function assertMatchingTokenization(actual: ContextTokenization, expected: ContextTokenization, msg: string) {
assert.equal(actual.tokens.length, expected.tokens.length, msg);
assert.deepEqual(actual.exampleInput, expected.exampleInput, msg);
assert.deepEqual(actual.transitionEdits, expected.transitionEdits, msg);

for(let j=0; j < actual.tokens.length; j++) {
const nestedMsg = `${msg}, token ${j}`;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ describe('determineContextTransition', () => {
assert.equal(transition, tracker.latest);
assert.isFalse(warningEmitterSpy.called);
assert.sameOrderedMembers(transition.final.displayTokenization.exampleInput, ['this', ' ', 'is', ' ', 'for', ' ', 'techn']);
assert.isOk(transition.final.displayTokenization.transitionEdits);
assert.equal(transition.final.context.left, targetContext.left);
assert.equal(transition.final.context.right ?? "", targetContext.right ?? "");
assert.sameDeepOrderedMembers(transition.inputDistribution, inputDistribution);
Expand Down
Loading