textsplitter: add option to join table rows (#981)

Originally, the MarkdownTextSplitter would split tables into one chunk per row, with each chunk containing the table header plus a single row. This change adds an option to join multiple rows into a single chunk. Fixes: #938
tmc · Sep 13, 2024 · 84cd854 · 84cd854
1 parent f4c2abb
commit 84cd854
Show file tree

Hide file tree

Showing 3 changed files with 124 additions and 12 deletions.
diff --git a/textsplitter/markdown_splitter.go b/textsplitter/markdown_splitter.go
@@ -24,6 +24,7 @@ func NewMarkdownTextSplitter(opts ...Option) *MarkdownTextSplitter {
 		CodeBlocks:       options.CodeBlocks,
 		ReferenceLinks:   options.ReferenceLinks,
 		HeadingHierarchy: options.KeepHeadingHierarchy,
+		JoinTableRows:    options.JoinTableRows,
 	}
 
 	if sp.SecondSplitter == nil {
@@ -55,6 +56,7 @@ type MarkdownTextSplitter struct {
 	CodeBlocks       bool
 	ReferenceLinks   bool
 	HeadingHierarchy bool
+	JoinTableRows    bool
 }
 
 // SplitText splits a text into multiple text.
@@ -71,6 +73,7 @@ func (sp MarkdownTextSplitter) SplitText(text string) ([]string, error) {
 		secondSplitter:         sp.SecondSplitter,
 		renderCodeBlocks:       sp.CodeBlocks,
 		useInlineContent:       !sp.ReferenceLinks,
+		joinTableRows:          sp.JoinTableRows,
 		hTitleStack:            []string{},
 		hTitlePrependHierarchy: sp.HeadingHierarchy,
 	}
@@ -126,6 +129,10 @@ type markdownContext struct {
 
 	// useInlineContent determines whether the default inline content is rendered
 	useInlineContent bool
+
+	// joinTableRows determines whether a chunk should contain multiple table rows,
+	// or if each row in a table should be split into a separate chunk.
+	joinTableRows bool
 }
 
 // splitText splits Markdown text.
@@ -425,14 +432,22 @@ func (mc *markdownContext) splitTableRows(header []string, bodies [][]string) {
 		return
 	}
 
-	// append table header
 	for _, row := range bodies {
 		line := tableRowInMarkdown(row)
 
-		mc.joinSnippet(fmt.Sprintf("%s\n%s", headerMD, line))
+		// If we're at the start of the current snippet, or adding the current line would
+		// overflow the chunk size, prepend the header to the line (so that the new chunk
+		// will include the table header).
+		if len(mc.curSnippet) == 0 || utf8.RuneCountInString(mc.curSnippet)+utf8.RuneCountInString(line) >= mc.chunkSize {
+			line = fmt.Sprintf("%s\n%s", headerMD, line)
+		}
 
-		// keep every row in a single Document
-		mc.applyToChunks()
+		mc.joinSnippet(line)
+
+		// If we're not joining table rows, create a new chunk.
+		if !mc.joinTableRows {
+			mc.applyToChunks()
+		}
 	}
 }
 

diff --git a/textsplitter/markdown_splitter_test.go b/textsplitter/markdown_splitter_test.go
@@ -95,14 +95,50 @@ Some content below h1>h2>h4.`,
 }
 
 // TestMarkdownHeaderTextSplitter_Table markdown always split by line.
+//
+//nolint:funlen
 func TestMarkdownHeaderTextSplitter_Table(t *testing.T) {
 	t.Parallel()
+
 	type testCase struct {
+		name         string
 		markdown     string
+		options      []Option
 		expectedDocs []schema.Document
 	}
+
 	testCases := []testCase{
 		{
+			name: "size(64)-overlap(32)",
+			options: []Option{
+				WithChunkSize(64),
+				WithChunkOverlap(32),
+			},
+			markdown: `| Syntax      | Description |
+| ----------- | ----------- |
+| Header      | Title       |
+| Paragraph   | Text        |`,
+			expectedDocs: []schema.Document{
+				{
+					PageContent: `| Syntax | Description |
+| --- | --- |
+| Header | Title |`,
+					Metadata: map[string]any{},
+				},
+				{
+					PageContent: `| Syntax | Description |
+| --- | --- |
+| Paragraph | Text |`,
+					Metadata: map[string]any{},
+				},
+			},
+		},
+		{
+			name: "size(512)-overlap(64)",
+			options: []Option{
+				WithChunkSize(512),
+				WithChunkOverlap(64),
+			},
 			markdown: `| Syntax      | Description |
 | ----------- | ----------- |
 | Header      | Title       |
@@ -117,6 +153,53 @@ func TestMarkdownHeaderTextSplitter_Table(t *testing.T) {
 				{
 					PageContent: `| Syntax | Description |
 | --- | --- |
+| Paragraph | Text |`,
+					Metadata: map[string]any{},
+				},
+			},
+		},
+		{
+			name: "big-tables-overflow",
+			options: []Option{
+				WithChunkSize(64),
+				WithChunkOverlap(32),
+				WithJoinTableRows(true),
+			},
+			markdown: `| Syntax      | Description |
+| ----------- | ----------- |
+| Header      | Title       |
+| Paragraph   | Text        |`,
+			expectedDocs: []schema.Document{
+				{
+					PageContent: `| Syntax | Description |
+| --- | --- |
+| Header | Title |`,
+					Metadata: map[string]any{},
+				},
+				{
+					PageContent: `| Syntax | Description |
+| --- | --- |
+| Paragraph | Text |`,
+					Metadata: map[string]any{},
+				},
+			},
+		},
+		{
+			name: "big-tables",
+			options: []Option{
+				WithChunkSize(128),
+				WithChunkOverlap(32),
+				WithJoinTableRows(true),
+			},
+			markdown: `| Syntax      | Description |
+| ----------- | ----------- |
+| Header      | Title       |
+| Paragraph   | Text        |`,
+			expectedDocs: []schema.Document{
+				{
+					PageContent: `| Syntax | Description |
+| --- | --- |
+| Header | Title |
 | Paragraph | Text |`,
 					Metadata: map[string]any{},
 				},
@@ -125,15 +208,17 @@ func TestMarkdownHeaderTextSplitter_Table(t *testing.T) {
 	}
 
 	for _, tc := range testCases {
-		splitter := NewMarkdownTextSplitter(WithChunkSize(64), WithChunkOverlap(32))
-		docs, err := CreateDocuments(splitter, []string{tc.markdown}, nil)
-		require.NoError(t, err)
-		assert.Equal(t, tc.expectedDocs, docs)
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
 
-		splitter = NewMarkdownTextSplitter(WithChunkSize(512), WithChunkOverlap(64))
-		docs, err = CreateDocuments(splitter, []string{tc.markdown}, nil)
-		require.NoError(t, err)
-		assert.Equal(t, tc.expectedDocs, docs)
+			rq := require.New(t)
+
+			splitter := NewMarkdownTextSplitter(tc.options...)
+
+			docs, err := CreateDocuments(splitter, []string{tc.markdown}, nil)
+			rq.NoError(err)
+			rq.Equal(tc.expectedDocs, docs)
+		})
 	}
 }
 

diff --git a/textsplitter/options.go b/textsplitter/options.go
@@ -17,6 +17,7 @@ type Options struct {
 	CodeBlocks           bool
 	ReferenceLinks       bool
 	KeepHeadingHierarchy bool // Persist hierarchy of markdown headers in each chunk
+	JoinTableRows        bool
 }
 
 // DefaultOptions returns the default options for all text splitter.
@@ -145,3 +146,14 @@ func WithHeadingHierarchy(trackHeadingHierarchy bool) Option {
 		o.KeepHeadingHierarchy = trackHeadingHierarchy
 	}
 }
+
+// WithJoinTableRows sets whether multiple table rows may share a single chunk. When join is
+// true, rows are joined into one chunk until the chunk size is reached; when join is false,
+// each row is placed in its own chunk.
+//
+// The default is false, so tables are split by row unless this option is set.
+func WithJoinTableRows(join bool) Option {
+	return func(o *Options) {
+		o.JoinTableRows = join
+	}
+}