Skip to content
This repository has been archived by the owner on Oct 16, 2018. It is now read-only.

[index] Add Batch method for inserting multiple documents at a time #34

Merged
merged 8 commits into from
Apr 26, 2018
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 19 additions & 20 deletions doc/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,8 @@ import (
)

var (
errMultipleIDs = errors.New("document cannot contain multiple IDs")
errZeroLengthID = errors.New("document ID cannot be of length zero")
errEmptyDocument = errors.New("document cannot be empty")
errReservedFieldName = fmt.Errorf("'%s' is a reserved field name", IDReservedFieldName)
errEmptyDocument = errors.New("document cannot be empty")
)

// IDReservedFieldName is the field name reserved for IDs.
Expand Down Expand Up @@ -89,7 +88,7 @@ func (f Fields) shallowCopy() Fields {

// Document represents a document to be indexed.
type Document struct {
// Fields contains the list of fields by which to index the document.
ID []byte
Fields []Field
}

Expand All @@ -105,6 +104,10 @@ func (d Document) Get(fieldName []byte) ([]byte, bool) {

// Equal returns whether this document is equal to another.
func (d Document) Equal(other Document) bool {
if !bytes.Equal(d.ID, other.ID) {
return false
}

if len(d.Fields) != len(other.Fields) {
return false
}
Expand Down Expand Up @@ -133,35 +136,31 @@ func (d Document) Equal(other Document) bool {
return true
}

// Validate validates the given document and returns its ID if it has one.
func (d Document) Validate() ([]byte, error) {
// Validate returns a bool indicating whether the document is valid.
func (d Document) Validate() error {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this call HasID()? maybe add a test for this case too

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need to require an ID here, in fact, in the segment we currently check if a document is valid before checking its ID. We might want to remove the check for the fields though in case we ever want to index just an ID. Not sure if that would ever be needed though.

if len(d.Fields) == 0 {
return nil, errEmptyDocument
return errEmptyDocument
}

var id []byte
for _, f := range d.Fields {
// TODO: Should we enforce uniqueness of field names?
if !utf8.Valid(f.Name) {
return nil, fmt.Errorf("document contains invalid field name: %v", f.Name)
return fmt.Errorf("document contains invalid field name: %v", f.Name)
}

if bytes.Equal(f.Name, IDReservedFieldName) {
if id != nil {
return nil, errMultipleIDs
}

if len(f.Value) == 0 {
return nil, errZeroLengthID
}

id = f.Value
return errReservedFieldName
}

if !utf8.Valid(f.Value) {
return nil, fmt.Errorf("document contains invalid field value: %v", f.Value)
return fmt.Errorf("document contains invalid field value: %v", f.Value)
}
}

return id, nil
return nil
}

// HasID returns a bool indicating whether the document has an ID or not.
func (d Document) HasID() bool {
return len(d.ID) > 0
}
84 changes: 72 additions & 12 deletions doc/document_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ func TestDocumentEquality(t *testing.T) {
{
name: "documents with the same fields in the same order are equal",
l: Document{
ID: []byte("831992"),
Fields: []Field{
Field{
Name: []byte("apple"),
Expand All @@ -171,6 +172,7 @@ func TestDocumentEquality(t *testing.T) {
},
},
r: Document{
ID: []byte("831992"),
Fields: []Field{
Field{
Name: []byte("apple"),
Expand All @@ -187,6 +189,7 @@ func TestDocumentEquality(t *testing.T) {
{
name: "documents with the same fields in different order are equal",
l: Document{
ID: []byte("831992"),
Fields: []Field{
Field{
Name: []byte("banana"),
Expand All @@ -199,6 +202,7 @@ func TestDocumentEquality(t *testing.T) {
},
},
r: Document{
ID: []byte("831992"),
Fields: []Field{
Field{
Name: []byte("apple"),
Expand All @@ -215,6 +219,7 @@ func TestDocumentEquality(t *testing.T) {
{
name: "documents with different fields are unequal",
l: Document{
ID: []byte("831992"),
Fields: []Field{
Field{
Name: []byte("apple"),
Expand All @@ -227,6 +232,7 @@ func TestDocumentEquality(t *testing.T) {
},
},
r: Document{
ID: []byte("831992"),
Fields: []Field{
Field{
Name: []byte("apple"),
Expand All @@ -240,6 +246,28 @@ func TestDocumentEquality(t *testing.T) {
},
expected: false,
},
{
name: "documents with different IDs are unequal",
l: Document{
ID: []byte("831992"),
Fields: []Field{
Field{
Name: []byte("apple"),
Value: []byte("red"),
},
},
},
r: Document{
ID: []byte("080292"),
Fields: []Field{
Field{
Name: []byte("apple"),
Value: []byte("red"),
},
},
},
expected: false,
},
}

for _, test := range tests {
Expand All @@ -254,7 +282,6 @@ func TestDocumentValidation(t *testing.T) {
name string
input Document
expectedErr bool
expectedID []byte
}{
{
name: "empty document",
Expand Down Expand Up @@ -286,46 +313,79 @@ func TestDocumentValidation(t *testing.T) {
expectedErr: true,
},
{
name: "valid document with ID",
name: "document contains field with reserved field name",
input: Document{
Fields: []Field{
Field{
Name: []byte("apple"),
Value: []byte("red"),
},
Field{
Name: IDReservedFieldName,
Value: []byte("123"),
},
},
},
expectedErr: false,
expectedID: nil,
expectedErr: true,
},
{
name: "valid document with ID",
name: "valid document",
input: Document{
Fields: []Field{
Field{
Name: []byte("apple"),
Value: []byte("red"),
},
Field{
Name: IDReservedFieldName,
Value: []byte("123"),
},
},
},
expectedErr: false,
expectedID: []byte("123"),
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
id, err := test.input.Validate()
err := test.input.Validate()
if test.expectedErr {
require.Error(t, err)
return
}
require.NoError(t, err)
require.Equal(t, test.expectedID, id)
})
}
}

func TestDocumentHasID(t *testing.T) {
tests := []struct {
name string
input Document
expected bool
}{
{
name: "nil ID",
input: Document{
ID: nil,
},
expected: false,
},
{
name: "zero-length ID",
input: Document{
ID: make([]byte, 0, 16),
},
expected: false,
},
{
name: "valid ID",
input: Document{
ID: []byte("831992"),
},
expected: true,
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
require.Equal(t, test.expected, test.input.HasID())
})
}
}
29 changes: 17 additions & 12 deletions generated/generics/generate.sh
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
#!/bin/bash

if [ -z $GOPATH ]; then
echo 'GOPATH is not set'
exit 1
echo 'GOPATH is not set'
exit 1
fi

GENERIC_MAP_PATH=${GOPATH}/src/github.com/m3db/m3ninx/vendor/github.com/m3db/m3x/generics/hashmap
GENERIC_MAP_IMPL=${GENERIC_MAP_PATH}/map.go

if [ ! -f "$GENERIC_MAP_IMPL" ]; then
echo "${GENERIC_MAP_IMPL} does not exist"
exit 1
echo "${GENERIC_MAP_IMPL} does not exist"
exit 1
fi

GENERATED_PATH=${GOPATH}/src/github.com/m3db/m3ninx/index/segment/mem
if [ ! -d "$GENERATED_PATH" ]; then
echo "${GENERATED_PATH} does not exist"
exit 1
echo "${GENERATED_PATH} does not exist"
exit 1
fi

# NB: We use (genny)[1] to combat the lack of generics in Go. It allows us
Expand All @@ -32,11 +32,16 @@ fi
# [1]: https://github.com/cheekybits/genny

mkdir -p $GENERATED_PATH/postingsgen
cat $GENERIC_MAP_IMPL \
| genny -out=${GENERATED_PATH}/postingsgen/generated_map.go \
-pkg=postingsgen gen "KeyType=[]byte ValueType=postings.MutableList"
cat $GENERIC_MAP_IMPL \
| genny -out=${GENERATED_PATH}/postingsgen/generated_map.go \
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While you're here, could you change the name to .../postingsgen/map_gen.go (for this one and others below) . _gen.go is the convention Rob started using in m3x/m3db recently.

And also update .excludecoverage

-pkg=postingsgen gen "KeyType=[]byte ValueType=postings.MutableList"

mkdir -p $GENERATED_PATH/fieldsgen
cat $GENERIC_MAP_IMPL \
| genny -out=${GENERATED_PATH}/fieldsgen/generated_map.go \
-pkg=fieldsgen gen "KeyType=[]byte ValueType=*postingsgen.ConcurrentMap"
cat $GENERIC_MAP_IMPL \
| genny -out=${GENERATED_PATH}/fieldsgen/generated_map.go \
-pkg=fieldsgen gen "KeyType=[]byte ValueType=*postingsgen.ConcurrentMap"

mkdir -p $GENERATED_PATH/idsgen
cat $GENERIC_MAP_IMPL \
| genny -out=${GENERATED_PATH}/idsgen/generated_map.go \
-pkg=idsgen gen "KeyType=[]byte ValueType=struct{}"
20 changes: 16 additions & 4 deletions index/encoding/doc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,23 @@ The payload comes after the header and it contains the actual documents.

#### Document

Each document is composed of fields. The number of fields in the document is encoded
first as a variable-sized unsigned integer and then the fields themselves are encoded.
Each document is composed of an ID and fields. The ID is a sequence of valid UTF-8 bytes
and it is encoded first by encoding the length of the ID, in bytes, as a variable-sized
unsigned integer and then encoding the actual bytes which comprise the ID. Following
the ID are the fields. The number of fields in the document is encoded first as a
variable-sized unsigned integer and then the fields themselves are encoded.

```
┌───────────────────────────┐
│ ┌───────────────────────┐ │
│ │ Length of ID │ │
│ │ (uvarint) │ │
│ ├───────────────────────┤ │
│ │ │ │
│ │ ID │ │
│ │ (bytes) │ │
│ │ │ │
│ ├───────────────────────┤ │
│ │ Number of Fields │ │
│ │ (uvarint) │ │
│ ├───────────────────────┤ │
Expand All @@ -94,8 +105,9 @@ first as a variable-sized unsigned integer and then the fields themselves are en
#### Field

Each field is composed of a name and a value. The name and value are a sequence of valid
UTF-8 bytes and they are encoded by prefixing the sequence of bytes with its length as
a variable-sized unsigned integer. The name is encoded first and the value second.
UTF-8 bytes and they are encoded by encoding the length of the name (value), in bytes, as
a variable-sized unsigned integer and then encoding the actual bytes which comprise the
name (value). The name is encoded first and the value second.

```
┌───────────────────────────┐
Expand Down
9 changes: 8 additions & 1 deletion index/encoding/doc/doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,9 @@ func (w *writer) Open() error {
}

func (w *writer) Write(d doc.Document) error {
w.enc.PutUvarint(uint64(len(d.Fields)))
w.enc.PutBytes(d.ID)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The encoder/decoder is beginning to look super similar to the types in https://github.com/m3db/m3db/blob/master/serialize/types.go

Any chance we can consolidate?

Copy link
Contributor Author

@jeromefroe jeromefroe Apr 20, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not opposed to consolidating, though given that they live in separate repos that may be difficult at the moment. Perhaps we can track in an issue?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, #36


w.enc.PutUvarint(uint64(len(d.Fields)))
for _, f := range d.Fields {
w.enc.PutBytes(f.Name)
w.enc.PutBytes(f.Value)
Expand Down Expand Up @@ -228,13 +229,19 @@ func (r *reader) Read() (doc.Document, error) {
return doc.Document{}, io.EOF
}

id, err := r.dec.Bytes()
if err != nil {
return doc.Document{}, err
}

x, err := r.dec.Uvarint()
if err != nil {
return doc.Document{}, err
}
n := int(x)

d := doc.Document{
ID: id,
Fields: make([]doc.Field, n),
}

Expand Down
2 changes: 2 additions & 0 deletions index/encoding/doc/doc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ func TestWriteReadDocuments(t *testing.T) {
name: "standard documents",
docs: []doc.Document{
doc.Document{
ID: []byte("831992"),
Fields: []doc.Field{
doc.Field{
Name: []byte("fruit"),
Expand All @@ -60,6 +61,7 @@ func TestWriteReadDocuments(t *testing.T) {
},
},
doc.Document{
ID: []byte("080392"),
Fields: []doc.Field{
doc.Field{
Name: []byte("fruit"),
Expand Down
Loading