Skip to content

Commit f0afe9e

Browse files
authored
Merge pull request #27 from baehyunsol/byte_tokenizer
2 parents d00da38 + b57599c commit f0afe9e

2 files changed

Lines changed: 34 additions & 0 deletions

File tree

src/tokenizer/byte.rs

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
use super::Tokenizer;
2+
use std::collections::{HashMap, HashSet};
3+
4+
/// A trivial tokenizer that maps each UTF-8 byte of the input to its
/// numeric value, giving a fixed vocabulary of 256 tokens.
///
/// The tokenizer is stateless, so the type is a unit struct.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ByteTokenizer;
5+
6+
impl ByteTokenizer {
7+
pub fn new() -> Self {
8+
ByteTokenizer
9+
}
10+
}
11+
12+
impl Tokenizer for ByteTokenizer {
    /// One token per possible byte value.
    fn vocab_size(&self) -> usize {
        256
    }

    /// Maps every UTF-8 byte of `string` to its numeric value (0..=255).
    fn tokenize(&self, string: &str) -> Vec<usize> {
        // `str::bytes()` yields the UTF-8 bytes directly; `usize::from`
        // is the lossless u8 -> usize widening.
        string.bytes().map(usize::from).collect()
    }

    /// Rebuilds a string from byte-valued tokens.
    ///
    /// Each token is narrowed with `as u8`, so values above 255 are
    /// silently truncated modulo 256. Byte sequences that are not valid
    /// UTF-8 are replaced with U+FFFD by `from_utf8_lossy`.
    fn untokenize(&self, tokens: &[usize]) -> String {
        let bytes: Vec<u8> = tokens.iter().map(|&t| t as u8).collect();
        String::from_utf8_lossy(&bytes).to_string()
    }
}

src/tokenizer/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
mod byte;
2+
pub use byte::*;
3+
14
mod simple;
25
pub use simple::*;
36

0 commit comments

Comments (0)