We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
2 parents d00da38 + b57599c commit f0afe9eCopy full SHA for f0afe9e
2 files changed
src/tokenizer/byte.rs
@@ -0,0 +1,31 @@
1
+use super::Tokenizer;
2
+use std::collections::{HashMap, HashSet};
3
+
4
/// Byte-level tokenizer.
///
/// A stateless unit struct: its `Tokenizer` implementation treats every byte
/// of the input string as one token, so there is nothing to configure or store.
/// Because the type holds no data, it is trivially copyable and comparable, and
/// `ByteTokenizer::default()` is equivalent to `ByteTokenizer::new()`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ByteTokenizer;
5
6
+impl ByteTokenizer {
7
+ pub fn new() -> Self {
8
+ ByteTokenizer
9
+ }
10
+}
11
12
+impl Tokenizer for ByteTokenizer {
13
+ fn vocab_size(&self) -> usize {
14
+ 256
15
16
+ fn tokenize(&self, string: &str) -> Vec<usize> {
17
+ string
18
+ .as_bytes()
19
+ .iter()
20
+ .map(|b| *b as usize)
21
+ .collect()
22
23
+ fn untokenize(&self, tokens: &[usize]) -> String {
24
+ String::from_utf8_lossy(
25
+ &tokens
26
27
+ .map(|b| *b as u8)
28
+ .collect::<Vec<u8>>())
29
+ .to_string()
30
31
src/tokenizer/mod.rs
@@ -1,3 +1,6 @@
+mod byte;
+pub use byte::*;
mod simple;
pub use simple::*;
0 commit comments