Skip to content

Commit

Permalink
textsplitter: add option to join table rows (#981)
Browse files Browse the repository at this point in the history
Originally the MarkdownTextSplitter would split tables into one chunk
per row (each chunk containing the table header plus a single row). This
change adds an option to join multiple rows into a single chunk.

Fixes: #938
  • Loading branch information
corani authored Sep 13, 2024
1 parent f4c2abb commit 84cd854
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 12 deletions.
23 changes: 19 additions & 4 deletions textsplitter/markdown_splitter.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ func NewMarkdownTextSplitter(opts ...Option) *MarkdownTextSplitter {
CodeBlocks: options.CodeBlocks,
ReferenceLinks: options.ReferenceLinks,
HeadingHierarchy: options.KeepHeadingHierarchy,
JoinTableRows: options.JoinTableRows,
}

if sp.SecondSplitter == nil {
Expand Down Expand Up @@ -55,6 +56,7 @@ type MarkdownTextSplitter struct {
CodeBlocks bool
ReferenceLinks bool
HeadingHierarchy bool
JoinTableRows bool
}

// SplitText splits a text into multiple text.
Expand All @@ -71,6 +73,7 @@ func (sp MarkdownTextSplitter) SplitText(text string) ([]string, error) {
secondSplitter: sp.SecondSplitter,
renderCodeBlocks: sp.CodeBlocks,
useInlineContent: !sp.ReferenceLinks,
joinTableRows: sp.JoinTableRows,
hTitleStack: []string{},
hTitlePrependHierarchy: sp.HeadingHierarchy,
}
Expand Down Expand Up @@ -126,6 +129,10 @@ type markdownContext struct {

// useInlineContent determines whether the default inline content is rendered
useInlineContent bool

// joinTableRows determines whether a chunk should contain multiple table rows,
// or if each row in a table should be split into a separate chunk.
joinTableRows bool
}

// splitText splits Markdown text.
Expand Down Expand Up @@ -425,14 +432,22 @@ func (mc *markdownContext) splitTableRows(header []string, bodies [][]string) {
return
}

// append table header
for _, row := range bodies {
line := tableRowInMarkdown(row)

mc.joinSnippet(fmt.Sprintf("%s\n%s", headerMD, line))
// If we're at the start of the current snippet, or adding the current line would
// overflow the chunk size, prepend the header to the line (so that the new chunk
// will include the table header).
if len(mc.curSnippet) == 0 || utf8.RuneCountInString(mc.curSnippet)+utf8.RuneCountInString(line) >= mc.chunkSize {
line = fmt.Sprintf("%s\n%s", headerMD, line)
}

// keep every row in a single Document
mc.applyToChunks()
mc.joinSnippet(line)

// If we're not joining table rows, create a new chunk.
if !mc.joinTableRows {
mc.applyToChunks()
}
}
}

Expand Down
101 changes: 93 additions & 8 deletions textsplitter/markdown_splitter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,14 +95,50 @@ Some content below h1>h2>h4.`,
}

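// TestMarkdownHeaderTextSplitter_Table checks how markdown tables are split: one chunk per row by default, or multiple rows per chunk when WithJoinTableRows(true) is set.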
//
//nolint:funlen
func TestMarkdownHeaderTextSplitter_Table(t *testing.T) {
t.Parallel()

type testCase struct {
name string
markdown string
options []Option
expectedDocs []schema.Document
}

testCases := []testCase{
{
name: "size(64)-overlap(32)",
options: []Option{
WithChunkSize(64),
WithChunkOverlap(32),
},
markdown: `| Syntax | Description |
| ----------- | ----------- |
| Header | Title |
| Paragraph | Text |`,
expectedDocs: []schema.Document{
{
PageContent: `| Syntax | Description |
| --- | --- |
| Header | Title |`,
Metadata: map[string]any{},
},
{
PageContent: `| Syntax | Description |
| --- | --- |
| Paragraph | Text |`,
Metadata: map[string]any{},
},
},
},
{
name: "size(512)-overlap(64)",
options: []Option{
WithChunkSize(512),
WithChunkOverlap(64),
},
markdown: `| Syntax | Description |
| ----------- | ----------- |
| Header | Title |
Expand All @@ -117,6 +153,53 @@ func TestMarkdownHeaderTextSplitter_Table(t *testing.T) {
{
PageContent: `| Syntax | Description |
| --- | --- |
| Paragraph | Text |`,
Metadata: map[string]any{},
},
},
},
{
name: "big-tables-overflow",
options: []Option{
WithChunkSize(64),
WithChunkOverlap(32),
WithJoinTableRows(true),
},
markdown: `| Syntax | Description |
| ----------- | ----------- |
| Header | Title |
| Paragraph | Text |`,
expectedDocs: []schema.Document{
{
PageContent: `| Syntax | Description |
| --- | --- |
| Header | Title |`,
Metadata: map[string]any{},
},
{
PageContent: `| Syntax | Description |
| --- | --- |
| Paragraph | Text |`,
Metadata: map[string]any{},
},
},
},
{
name: "big-tables",
options: []Option{
WithChunkSize(128),
WithChunkOverlap(32),
WithJoinTableRows(true),
},
markdown: `| Syntax | Description |
| ----------- | ----------- |
| Header | Title |
| Paragraph | Text |`,
expectedDocs: []schema.Document{
{
PageContent: `| Syntax | Description |
| --- | --- |
| Header | Title |
| Paragraph | Text |`,
Metadata: map[string]any{},
},
Expand All @@ -125,15 +208,17 @@ func TestMarkdownHeaderTextSplitter_Table(t *testing.T) {
}

for _, tc := range testCases {
splitter := NewMarkdownTextSplitter(WithChunkSize(64), WithChunkOverlap(32))
docs, err := CreateDocuments(splitter, []string{tc.markdown}, nil)
require.NoError(t, err)
assert.Equal(t, tc.expectedDocs, docs)
t.Run(tc.name, func(t *testing.T) {
t.Parallel()

splitter = NewMarkdownTextSplitter(WithChunkSize(512), WithChunkOverlap(64))
docs, err = CreateDocuments(splitter, []string{tc.markdown}, nil)
require.NoError(t, err)
assert.Equal(t, tc.expectedDocs, docs)
rq := require.New(t)

splitter := NewMarkdownTextSplitter(tc.options...)

docs, err := CreateDocuments(splitter, []string{tc.markdown}, nil)
rq.NoError(err)
rq.Equal(tc.expectedDocs, docs)
})
}
}

Expand Down
12 changes: 12 additions & 0 deletions textsplitter/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ type Options struct {
CodeBlocks bool
ReferenceLinks bool
KeepHeadingHierarchy bool // Persist hierarchy of markdown headers in each chunk
JoinTableRows bool
}

// DefaultOptions returns the default options for all text splitter.
Expand Down Expand Up @@ -145,3 +146,14 @@ func WithHeadingHierarchy(trackHeadingHierarchy bool) Option {
o.KeepHeadingHierarchy = trackHeadingHierarchy
}
}

// WithJoinTableRows controls how tables are chunked. When join is true,
// table rows are joined into a chunk until the chunk size is reached. When
// join is false (the default), tables are split by row, so that each row is
// in a separate chunk.
func WithJoinTableRows(join bool) Option {
	apply := func(opts *Options) {
		opts.JoinTableRows = join
	}

	return apply
}

0 comments on commit 84cd854

Please sign in to comment.