Merge pull request #17 from coreweave/rahul/update_tokenizer
feat(tokenizer): Add Mistral and Llama support with backwards compatibility
rtalaricw authored May 20, 2024
2 parents edd9879 + cbfb5c2 commit bef6e0d
Showing 16 changed files with 91,731 additions and 115 deletions.
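
As a rough usage sketch (not part of the diff), the constructors this commit touches can be exercised as below. It is illustrative only: it assumes the existing GPTEncoder API is unchanged, with Encode taking a *string and returning a *Tokens, and that the tokenizer resources for each vocabulary can be resolved at run time.

package main

import (
	"fmt"

	"github.com/wbrown/gpt_bpe"
)

func main() {
	// Existing constructors keep working unchanged, which is the
	// backwards-compatibility half of this commit...
	gpt2 := gpt_bpe.NewGPT2Encoder()

	// ...and the new vocabularies are exposed through the same pattern.
	llama := gpt_bpe.NewLlama2Encoder()
	mistral := gpt_bpe.NewMistralEncoder()

	text := "The quick brown fox jumps over the lazy dog"
	fmt.Println("gpt2:", len(*gpt2.Encode(&text)))       // assumed Encode(*string) *Tokens
	fmt.Println("llama:", len(*llama.Encode(&text)))     // signature carried over from
	fmt.Println("mistral:", len(*mistral.Encode(&text))) // the existing API
}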
64 changes: 64 additions & 0 deletions .gitignore
@@ -0,0 +1,64 @@
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib

# Test binary, coverage reports, and build directories
*.test
*.out
*.coverprofile
*.cover
*.cov

# Logs
*.log

# OS generated files
.DS_Store
Thumbs.db

# Temporary files
*.tmp
*.temp

# IDE and Editor specific files
.vscode/
.idea/
*.swp
*~
*.swo

# Dependency directories and files
vendor/
Gopkg.lock
Gopkg.toml
go.sum

# Build directories
bin/
obj/
pkg/

# Test directories
Test*/

# IDE specific project files and directories
*.iml
*.ipr
*.iws

# Code coverage tool output
*.prof
coverage.txt

# Go workspace and tools
go.work
go.work.sum

# Exclude Go module download cache
/.go/

# Exclude VS Code Go extension settings
.vscode/go.*.json
6 changes: 5 additions & 1 deletion cmd/dataset_tokenizer/go.mod
@@ -7,7 +7,7 @@ replace github.com/wbrown/gpt_bpe => ../../
 require (
 	github.com/aws/aws-sdk-go v1.45.4
 	github.com/stretchr/testify v1.7.1
-	github.com/wbrown/gpt_bpe v0.0.0-00010101000000-000000000000
+	github.com/wbrown/gpt_bpe v0.0.0-20240410161531-edd9879e0496
 	github.com/yargevad/filepathx v1.0.0
 )

@@ -16,14 +16,18 @@ require (
github.com/deckarep/golang-set v1.8.0 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/edsrzf/mmap-go v1.1.0 // indirect
github.com/golang/protobuf v1.5.2 // indirect
github.com/hashicorp/golang-lru v0.5.4 // indirect
github.com/jdkato/prose/v2 v2.0.0 // indirect
github.com/jmespath/go-jmespath v0.4.0 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/mingrammer/commonregex v1.0.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/vikesh-raj/go-sentencepiece-encoder v1.1.1 // indirect
golang.org/x/sys v0.3.0 // indirect
golang.org/x/text v0.4.0 // indirect
gonum.org/v1/gonum v0.12.0 // indirect
google.golang.org/protobuf v1.26.0 // indirect
gopkg.in/neurosnap/sentences.v1 v1.0.7 // indirect
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect
)
3 changes: 3 additions & 0 deletions cmd/dataset_tokenizer/go.sum
@@ -135,6 +135,7 @@ github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw
github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/golang/protobuf v1.5.1/go.mod h1:DopwsBzvsk0Fs44TXzsVbJyPhcCPeIwnvohx4u74HPM=
github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw=
github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
@@ -287,6 +288,7 @@ github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMT
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw=
github.com/urfave/cli v1.22.4/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
github.com/vikesh-raj/go-sentencepiece-encoder v1.1.1 h1:q5Rm4ihhwmAiDycaL8rNiE/ly4on+nHQajElYLPN7TM=
github.com/vikesh-raj/go-sentencepiece-encoder v1.1.1/go.mod h1:GlANpY4lgPZT+cpb0pkEJrTMbICKc74KleEZwEiGqmU=
github.com/yargevad/filepathx v1.0.0 h1:SYcT+N3tYGi+NvazubCNlvgIPbzAk7i7y2dwg3I5FYc=
github.com/yargevad/filepathx v1.0.0/go.mod h1:BprfX/gpYNJHJfc35GjRRpVcwWXS89gGulUIU5tK3tA=
@@ -683,6 +685,7 @@ google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpAD
google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4=
google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.26.0 h1:bxAC2xTBsZGibn2RTntX0oH50xLsqy1OxA9tTL3p/lk=
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
2 changes: 1 addition & 1 deletion cmd/tokenizer_repl/go.mod
@@ -8,7 +8,7 @@ require github.com/wbrown/gpt_bpe v0.0.0-20230503154557-854424f32a83

 require (
 	github.com/deckarep/golang-set v1.8.0 // indirect
-	github.com/dustin/go-humanize v1.0.0 // indirect
+	github.com/dustin/go-humanize v1.0.1 // indirect
 	github.com/edsrzf/mmap-go v1.1.0 // indirect
 	github.com/golang/protobuf v1.5.2 // indirect
 	github.com/gopherjs/gopherjs v1.17.2 // indirect
25 changes: 24 additions & 1 deletion gpt_bpe.go
@@ -105,6 +105,7 @@ const VOCAB_ID_CLIP = "clip-tokenizer"
 const VOCAB_ID_NERDSTASH_V1 = "nerdstash_v1-tokenizer"
 const VOCAB_ID_NERDSTASH_V2 = "nerdstash_v2-tokenizer"
 const VOCAB_ID_LLAMA = "llama-tokenizer"
+const VOCAB_ID_MISTRAL = "mistral-tokenizer"

 func NewGPT2Encoder() GPTEncoder {
 	encoder, _ := NewEncoder(VOCAB_ID_GPT2)
@@ -136,6 +137,11 @@ func NewLlama2Encoder() GPTEncoder {
 	return *encoder
 }

+func NewMistralEncoder() GPTEncoder {
+	encoder, _ := NewEncoder(VOCAB_ID_MISTRAL)
+	return *encoder
+}
+
 // NewEncoder
 // Returns a GPTEncoder with the tokenizer data loaded for that vocabulary
 // id.
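
NewMistralEncoder above discards the error from NewEncoder, matching the other convenience constructors. A minimal sketch of the more defensive route through the exported vocabulary id, again assuming the Encode signature noted earlier; the error wording is illustrative:

package main

import (
	"fmt"
	"log"

	"github.com/wbrown/gpt_bpe"
)

func main() {
	// NewEncoder surfaces the resource-loading error that NewMistralEncoder drops.
	encoder, err := gpt_bpe.NewEncoder(gpt_bpe.VOCAB_ID_MISTRAL)
	if err != nil {
		log.Fatalf("could not load %s: %v", gpt_bpe.VOCAB_ID_MISTRAL, err)
	}
	text := "hello world"
	fmt.Println(len(*encoder.Encode(&text))) // assumed Encode(*string) *Tokens signature
}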
@@ -174,11 +180,23 @@ func NewEncoder(vocabId string) (*GPTEncoder, error) {
 		AddEosToken: false,
 		PadToken: "",
 	}
+	altMistralSpecialsConfig := resources.MistralSpecialsConfig{
+		AddBosToken: false,
+		AddEosToken: false,
+		PadToken: "",
+	}
 	if special, ok := (rsrcs)["tokenizer_config.json"]; ok {
 		if special.Data != nil {
 			err := json.Unmarshal(*special.Data, &tokenizerSpecialConfig)
 			if err != nil {
-				log.Fatal("Error unmarshalling tokenizer_config.json: ", err)
+				err = json.Unmarshal(*special.Data, &altMistralSpecialsConfig)
+				if err != nil {
+					log.Fatal("Error unmarshalling tokenizer_config.json")
+				}
+				//populate the tokenizerSpecialConfig from the altMistralSpecialsConfig
+				tokenizerSpecialConfig.AddBosToken = altMistralSpecialsConfig.AddBosToken
+				tokenizerSpecialConfig.AddEosToken = altMistralSpecialsConfig.AddEosToken
+				tokenizerSpecialConfig.PadToken = altMistralSpecialsConfig.PadToken
 			}
 		}
 	}
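
This fallback is what provides the backwards compatibility called out in the commit message: tokenizer_config.json is first unmarshalled into the existing specials struct and, only if that fails, re-read as a resources.MistralSpecialsConfig whose fields are copied across. The exact field differences between the two shapes are not visible in this hunk, so the sketch below is a hypothetical illustration of the pattern only: it invents a bos_token that is an object in one shape and a plain string in the other purely to make the retry path fire, and its struct names and JSON are not the actual resources types.

package main

import (
	"encoding/json"
	"fmt"
	"log"
)

// Hypothetical shapes for illustration only; not the real
// resources.MistralSpecialsConfig or its counterpart.
type strictSpecials struct {
	AddBosToken bool   `json:"add_bos_token"`
	BosToken    string `json:"bos_token"`
}

type altSpecials struct {
	AddBosToken bool `json:"add_bos_token"`
	BosToken    struct {
		Content string `json:"content"`
	} `json:"bos_token"`
}

func main() {
	// A config where bos_token is an object rather than a plain string.
	data := []byte(`{"add_bos_token": true, "bos_token": {"content": "<s>"}}`)

	var cfg strictSpecials
	if err := json.Unmarshal(data, &cfg); err != nil {
		// The first shape failed; retry with the alternate shape and copy the
		// fields over, mirroring the altMistralSpecialsConfig fallback above.
		var alt altSpecials
		if err := json.Unmarshal(data, &alt); err != nil {
			log.Fatal("Error unmarshalling tokenizer_config.json")
		}
		cfg.AddBosToken = alt.AddBosToken
		cfg.BosToken = alt.BosToken.Content
	}
	fmt.Printf("%+v\n", cfg)
}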
@@ -1018,6 +1036,11 @@ func (encoder *GPTEncoder) StreamingEncode(reader io.RuneReader) func(int) *Tokens
 	if encoder.encloseEosBos || encoder.encloseBos {
 		accumulator = append(accumulator, encoder.BosToken)
 	}
+	// Temporary hack - inject a space token at the end of the accumulator for mistral-tokenizer
+	if encoder.VocabId == "mistral-tokenizer" {
+		accumulator = append(accumulator, encoder.Encoder[" "])
+	}
+
 	return func(desiredTokens int) *Tokens {
 		for {
 			// If we have enough tokens, then we return them, and reset the
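
A minimal sketch of driving the streaming path above with the new tokenizer. The StreamingEncode signature comes from the hunk header; the assumption that the returned closure yields nil once the reader is exhausted is not shown in this diff. Per the temporary hack above, the stream for mistral-tokenizer is seeded with the encoding of a single space right after any BOS token.

package main

import (
	"fmt"
	"strings"

	"github.com/wbrown/gpt_bpe"
)

func main() {
	encoder := gpt_bpe.NewMistralEncoder()
	next := encoder.StreamingEncode(strings.NewReader("Attention is all you need."))

	// Drain the stream in fixed-size chunks; the closure is assumed to return
	// nil once no more tokens can be produced.
	for {
		chunk := next(8)
		if chunk == nil {
			break
		}
		fmt.Println(*chunk)
	}
}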