Skip to content

Commit

Permalink
.Net: Create a getting started project for vector stores. (#9489)
Browse files Browse the repository at this point in the history
### Motivation and Context

#7606

Only three steps so far, more to be added in a subsequent PR.

### Description

- Add a getting started project for vector stores
- Add three initial steps, more to follow
- Update README files

### Contribution Checklist

<!-- Before submitting this PR, please make sure: -->

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution
Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md)
and the [pre-submission formatting
script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts)
raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄
  • Loading branch information
westey-m authored Nov 4, 2024
1 parent aec6577 commit f88cf5f
Show file tree
Hide file tree
Showing 9 changed files with 457 additions and 11 deletions.
15 changes: 12 additions & 3 deletions dotnet/SK-dotnet.sln
Original file line number Diff line number Diff line change
Expand Up @@ -317,11 +317,11 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TimePlugin", "samples\Demos
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Connectors.Memory.AzureCosmosDBNoSQL", "src\Connectors\Connectors.Memory.AzureCosmosDBNoSQL\Connectors.Memory.AzureCosmosDBNoSQL.csproj", "{B0B3901E-AF56-432B-8FAA-858468E5D0DF}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Connectors.Amazon", "src\Connectors\Connectors.Amazon\Connectors.Amazon.csproj", "{E059E9B0-1302-474D-B1B5-10A6E0F1A769}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Connectors.Amazon", "src\Connectors\Connectors.Amazon\Connectors.Amazon.csproj", "{E059E9B0-1302-474D-B1B5-10A6E0F1A769}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AmazonBedrockAIModels", "samples\Demos\AmazonBedrockModels\AmazonBedrockAIModels.csproj", "{ABEAACCD-CF63-4850-8ED5-E01379DBFC46}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AmazonBedrockAIModels", "samples\Demos\AmazonBedrockModels\AmazonBedrockAIModels.csproj", "{ABEAACCD-CF63-4850-8ED5-E01379DBFC46}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Connectors.Amazon.UnitTests", "src\Connectors\Connectors.Amazon.UnitTests\Connectors.Amazon.UnitTests.csproj", "{CCC6DC57-2AC1-4C8E-A448-2CC0537A288E}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Connectors.Amazon.UnitTests", "src\Connectors\Connectors.Amazon.UnitTests\Connectors.Amazon.UnitTests.csproj", "{CCC6DC57-2AC1-4C8E-A448-2CC0537A288E}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Connectors.Redis.UnitTests", "src\Connectors\Connectors.Redis.UnitTests\Connectors.Redis.UnitTests.csproj", "{1D4667B9-9381-4E32-895F-123B94253EE8}"
EndProject
Expand Down Expand Up @@ -409,6 +409,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SemanticKernel.AotTests", "
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Process.Utilities.UnitTests", "src\Experimental\Process.Utilities.UnitTests\Process.Utilities.UnitTests.csproj", "{DAC54048-A39A-4739-8307-EA5A291F2EA0}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "GettingStartedWithVectorStores", "samples\GettingStartedWithVectorStores\GettingStartedWithVectorStores.csproj", "{8C3DE41C-E2C8-42B9-8638-574F8946EB0E}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -1070,6 +1072,12 @@ Global
{DAC54048-A39A-4739-8307-EA5A291F2EA0}.Publish|Any CPU.Build.0 = Debug|Any CPU
{DAC54048-A39A-4739-8307-EA5A291F2EA0}.Release|Any CPU.ActiveCfg = Release|Any CPU
{DAC54048-A39A-4739-8307-EA5A291F2EA0}.Release|Any CPU.Build.0 = Release|Any CPU
{8C3DE41C-E2C8-42B9-8638-574F8946EB0E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{8C3DE41C-E2C8-42B9-8638-574F8946EB0E}.Debug|Any CPU.Build.0 = Debug|Any CPU
{8C3DE41C-E2C8-42B9-8638-574F8946EB0E}.Publish|Any CPU.ActiveCfg = Debug|Any CPU
{8C3DE41C-E2C8-42B9-8638-574F8946EB0E}.Publish|Any CPU.Build.0 = Debug|Any CPU
{8C3DE41C-E2C8-42B9-8638-574F8946EB0E}.Release|Any CPU.ActiveCfg = Release|Any CPU
{8C3DE41C-E2C8-42B9-8638-574F8946EB0E}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -1217,6 +1225,7 @@ Global
{6ECFDF04-2237-4A85-B114-DAA34923E9E6} = {5D4C0700-BBB5-418F-A7B2-F392B9A18263}
{39EAB599-742F-417D-AF80-95F90376BB18} = {831DDCA2-7D2C-4C31-80DB-6BDB3E1F7AE0}
{DAC54048-A39A-4739-8307-EA5A291F2EA0} = {0D8C6358-5DAA-4EA6-A924-C268A9A21BC9}
{8C3DE41C-E2C8-42B9-8638-574F8946EB0E} = {FA3720F1-C99A-49B2-9577-A940257098BF}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {FBDC56A3-86AD-4323-AA0F-201E59123B83}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Sample project for vector stores. It is flagged as a test project and references xunit, so the samples are discovered and run as tests. -->
<PropertyGroup>
<AssemblyName>GettingStartedWithVectorStores</AssemblyName>
<!-- RootNamespace is intentionally left empty here; each source file declares its own namespace. -->
<RootNamespace></RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<TargetFramework>net8.0</TargetFramework>
<IsTestProject>true</IsTestProject>
<IsPackable>false</IsPackable>
<!-- Suppress: "Declare types in namespaces", "Require ConfigureAwait", "Experimental" -->
<!-- NOTE(review): the list below contains more codes than the three categories named above (e.g. CS8618, CS1591, CA5394) - confirm each is still needed. -->
<NoWarn>$(NoWarn);CS8618,IDE0009,CA1051,CA1050,CA1707,CA1054,CA2007,VSTHRD111,CS1591,RCS1110,RCS1243,CA5394,SKEXP0001,SKEXP0010,SKEXP0020,SKEXP0040,SKEXP0050,SKEXP0060,SKEXP0070,SKEXP0101</NoWarn>
<OutputType>Library</OutputType>
<!-- Identifier used by the .NET Secret Manager (dotnet user-secrets) for this project. -->
<UserSecretsId>5ee045b0-aea3-4f08-8d31-32d1a6f8fed0</UserSecretsId>
</PropertyGroup>
<!-- Package references carry no Version attribute; presumably versions are managed centrally for the repository - TODO confirm. -->
<ItemGroup>
<!-- Test framework and runner packages. -->
<PackageReference Include="Microsoft.NET.Test.Sdk" />
<PackageReference Include="xRetry" />
<PackageReference Include="xunit" />
<PackageReference Include="xunit.abstractions" />
<PackageReference Include="xunit.runner.visualstudio">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<PrivateAssets>all</PrivateAssets>
</PackageReference>
<!-- Identity, configuration, dependency injection, HTTP and logging packages. -->
<PackageReference Include="Azure.Identity" />
<PackageReference Include="Microsoft.Extensions.Configuration" />
<PackageReference Include="Microsoft.Extensions.Configuration.Binder" />
<PackageReference Include="Microsoft.Extensions.Configuration.EnvironmentVariables" />
<PackageReference Include="Microsoft.Extensions.Configuration.Json" />
<PackageReference Include="Microsoft.Extensions.Configuration.UserSecrets" />
<PackageReference Include="Microsoft.Extensions.DependencyInjection" />
<PackageReference Include="Microsoft.Extensions.Http" />
<PackageReference Include="Microsoft.Extensions.Http.Resilience" />
<PackageReference Include="Microsoft.Extensions.Logging" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Logging.Console" />
<!-- LINQ operators over IAsyncEnumerable (e.g. ToListAsync) and JSON support. -->
<PackageReference Include="System.Linq.Async" />
<PackageReference Include="System.Text.Json" />
</ItemGroup>

<!-- Imports the shared props file for samples from the repository's InternalUtilities folder. -->
<Import Project="$(RepoRoot)/dotnet/src/InternalUtilities/samples/SamplesInternalUtilities.props" />

<!-- Semantic Kernel projects referenced directly from source: core libraries, the Azure OpenAI connector, and the vector store connectors. -->
<ItemGroup>
<ProjectReference Include="..\..\src\Connectors\Connectors.AzureOpenAI\Connectors.AzureOpenAI.csproj" />
<ProjectReference Include="..\..\src\Connectors\Connectors.Memory.AzureAISearch\Connectors.Memory.AzureAISearch.csproj" />
<ProjectReference Include="..\..\src\Connectors\Connectors.Memory.InMemory\Connectors.Memory.InMemory.csproj" />
<ProjectReference Include="..\..\src\Connectors\Connectors.Memory.Redis\Connectors.Memory.Redis.csproj" />
<ProjectReference Include="..\..\src\SemanticKernel.Abstractions\SemanticKernel.Abstractions.csproj" />
<ProjectReference Include="..\..\src\SemanticKernel.Core\SemanticKernel.Core.csproj" />
</ItemGroup>
<!-- Project-wide using directives so sample files need no explicit "using Xunit;" lines. -->
<ItemGroup>
<Using Include="Xunit" />
<Using Include="Xunit.Abstractions" />
</ItemGroup>
</Project>
30 changes: 30 additions & 0 deletions dotnet/samples/GettingStartedWithVectorStores/Glossary.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.Extensions.VectorData;

namespace GettingStartedWithVectorStores;

/// <summary>
/// Sample model class that represents a glossary entry.
/// </summary>
/// <remarks>
/// Note that each property is decorated with an attribute that specifies how the property should be treated by the vector store.
/// This allows us to create a collection in the vector store and upsert and retrieve instances of this class without any further configuration.
/// </remarks>
internal sealed class Glossary
{
/// <summary>
/// Unique identifier of the glossary entry; marked as the record key for the vector store.
/// </summary>
[VectorStoreRecordKey]
public string Key { get; set; }

/// <summary>
/// Category the entry belongs to. Stored as a data property and flagged as filterable,
/// so it can be used in search filters.
/// </summary>
[VectorStoreRecordData(IsFilterable = true)]
public string Category { get; set; }

/// <summary>
/// The term being defined. Stored as a plain data property.
/// </summary>
[VectorStoreRecordData]
public string Term { get; set; }

/// <summary>
/// The definition text for the term. Stored as a plain data property.
/// </summary>
[VectorStoreRecordData]
public string Definition { get; set; }

/// <summary>
/// Embedding vector for the entry, declared with 1536 dimensions.
/// NOTE(review): presumably generated from <see cref="Definition"/>; the dimension count must match the embedding model in use - confirm.
/// </summary>
[VectorStoreRecordVector(Dimensions: 1536)]
public ReadOnlyMemory<float> DefinitionEmbedding { get; set; }
}
37 changes: 37 additions & 0 deletions dotnet/samples/GettingStartedWithVectorStores/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Starting With Semantic Kernel Vector Stores

This project contains a step-by-step guide to getting started with Vector Stores in Semantic Kernel.

The examples can be run as integration tests but their code can also be copied to stand-alone programs.

## Configuring Secrets

Most of the examples require secrets and credentials to access OpenAI, Azure OpenAI,
Vector Stores and other resources. We suggest using .NET
[Secret Manager](https://learn.microsoft.com/aspnet/core/security/app-secrets)
to avoid the risk of leaking secrets into the repository, branches and pull requests.
You can also use environment variables if you prefer.

To set your secrets with Secret Manager:

```
cd dotnet/samples/GettingStartedWithVectorStores
dotnet user-secrets init
dotnet user-secrets set "AzureOpenAIEmbeddings:DeploymentName" "..."
dotnet user-secrets set "AzureOpenAIEmbeddings:Endpoint" "..."
dotnet user-secrets set "AzureAISearch:Endpoint" "..."
dotnet user-secrets set "AzureAISearch:ApiKey" "..."
```

To set your secrets with environment variables, use these names:

```
AzureOpenAIEmbeddings__DeploymentName
AzureOpenAIEmbeddings__Endpoint
AzureAISearch__Endpoint
AzureAISearch__ApiKey
```
112 changes: 112 additions & 0 deletions dotnet/samples/GettingStartedWithVectorStores/Step1_Ingest_Data.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.Extensions.VectorData;
using Microsoft.SemanticKernel.Connectors.InMemory;
using Microsoft.SemanticKernel.Embeddings;

namespace GettingStartedWithVectorStores;

/// <summary>
/// Example showing how to generate embeddings and ingest data into an in-memory vector store.
/// </summary>
public class Step1_Ingest_Data(ITestOutputHelper output, VectorStoresFixture fixture) : BaseTest(output), IClassFixture<VectorStoresFixture>
{
    /// <summary>
    /// Example showing how to ingest data into an in-memory vector store.
    /// </summary>
    [Fact]
    public async Task IngestDataIntoInMemoryVectorStoreAsync()
    {
        // Construct the vector store and get the collection.
        var vectorStore = new InMemoryVectorStore();
        var collection = vectorStore.GetCollection<string, Glossary>("skglossary");

        // Ingest data into the collection.
        await IngestDataIntoVectorStoreAsync(collection, fixture.TextEmbeddingGenerationService);

        // Retrieve an item from the collection and write it to the console.
        // Key "4" is one of the sample entries created by CreateGlossaryEntries.
        var record = await collection.GetAsync("4");
        Console.WriteLine(record!.Definition);
    }

    /// <summary>
    /// Ingest the sample glossary entries into the given collection.
    /// </summary>
    /// <remarks>
    /// The collection is created if it does not exist, an embedding is generated for the
    /// definition of each entry, and the entries are then upserted into the collection.
    /// Embedding generation and upserts are each started for all entries and awaited together.
    /// </remarks>
    /// <param name="collection">The collection to ingest data into.</param>
    /// <param name="textEmbeddingGenerationService">The service to use for generating embeddings.</param>
    /// <returns>The keys of the upserted records.</returns>
    internal static async Task<IEnumerable<string>> IngestDataIntoVectorStoreAsync(
        IVectorStoreRecordCollection<string, Glossary> collection,
        ITextEmbeddingGenerationService textEmbeddingGenerationService)
    {
        // Create the collection if it doesn't exist.
        await collection.CreateCollectionIfNotExistsAsync();

        // Materialize the entries once so the same instances are enriched with embeddings
        // and then upserted (CreateGlossaryEntries is a lazy iterator).
        var glossaryEntries = CreateGlossaryEntries().ToList();

        // Generate embeddings for all entries concurrently. Embedding generation is already
        // asynchronous, so the calls are awaited directly rather than wrapped in Task.Run.
        var embeddingTasks = glossaryEntries.Select(async entry =>
        {
            entry.DefinitionEmbedding = await textEmbeddingGenerationService.GenerateEmbeddingAsync(entry.Definition);
        });
        await Task.WhenAll(embeddingTasks);

        // Upsert the glossary entries into the collection and return their keys.
        var upsertedKeysTasks = glossaryEntries.Select(x => collection.UpsertAsync(x));
        return await Task.WhenAll(upsertedKeysTasks);
    }

    /// <summary>
    /// Create some sample glossary entries.
    /// </summary>
    /// <remarks>
    /// The entries are produced lazily and carry no embedding; callers are expected to
    /// populate <see cref="Glossary.DefinitionEmbedding"/> before upserting.
    /// </remarks>
    /// <returns>A sequence of sample glossary entries with keys "1" through "6".</returns>
    private static IEnumerable<Glossary> CreateGlossaryEntries()
    {
        yield return new Glossary
        {
            Key = "1",
            Category = "Software",
            Term = "API",
            Definition = "Application Programming Interface. A set of rules and specifications that allow software components to communicate and exchange data."
        };

        yield return new Glossary
        {
            Key = "2",
            Category = "Software",
            Term = "SDK",
            Definition = "Software development kit. A set of libraries and tools that allow software developers to build software more easily."
        };

        yield return new Glossary
        {
            Key = "3",
            Category = "SK",
            Term = "Connectors",
            Definition = "Semantic Kernel Connectors allow software developers to integrate with various services providing AI capabilities, including LLM, AudioToText, TextToAudio, Embedding generation, etc."
        };

        yield return new Glossary
        {
            Key = "4",
            Category = "SK",
            Term = "Semantic Kernel",
            Definition = "Semantic Kernel is a set of libraries that allow software developers to more easily develop applications that make use of AI experiences."
        };

        yield return new Glossary
        {
            Key = "5",
            Category = "AI",
            Term = "RAG",
            Definition = "Retrieval Augmented Generation - a term that refers to the process of retrieving additional data to provide as context to an LLM to use when generating a response (completion) to a user’s question (prompt)."
        };

        yield return new Glossary
        {
            Key = "6",
            Category = "AI",
            Term = "LLM",
            Definition = "Large language model. A type of artificial intelligence algorithm that is designed to understand and generate human language."
        };
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.Extensions.VectorData;
using Microsoft.SemanticKernel.Connectors.InMemory;
using Microsoft.SemanticKernel.Embeddings;

namespace GettingStartedWithVectorStores;

/// <summary>
/// Example showing how to do vector searches with an in-memory vector store.
/// </summary>
public class Step2_Vector_Search(ITestOutputHelper output, VectorStoresFixture fixture) : BaseTest(output), IClassFixture<VectorStoresFixture>
{
    /// <summary>
    /// Basic vector search: look up the single most relevant glossary entry for a question.
    /// </summary>
    [Fact]
    public async Task SearchAnInMemoryVectorStoreAsync()
    {
        var glossaryCollection = await GetVectorStoreCollectionWithDataAsync();

        // Run the search using the reusable helper below.
        var topMatch = await SearchVectorStoreAsync(
            glossaryCollection,
            "What is an Application Programming Interface?",
            fixture.TextEmbeddingGenerationService);

        // Print the matched definition followed by its score.
        Console.WriteLine(topMatch.Record.Definition);
        Console.WriteLine(topMatch.Score);
    }

    /// <summary>
    /// Find the record in the given collection that best matches the given search string.
    /// </summary>
    /// <param name="collection">The collection to search.</param>
    /// <param name="searchString">The text to find the closest match for.</param>
    /// <param name="textEmbeddingGenerationService">The service used to turn the search text into an embedding.</param>
    /// <returns>The top search result.</returns>
    internal static async Task<VectorSearchResult<Glossary>> SearchVectorStoreAsync(IVectorStoreRecordCollection<string, Glossary> collection, string searchString, ITextEmbeddingGenerationService textEmbeddingGenerationService)
    {
        // Turn the search text into an embedding vector.
        var queryEmbedding = await textEmbeddingGenerationService.GenerateEmbeddingAsync(searchString);

        // Ask the store for only the single closest record.
        var response = await collection.VectorizedSearchAsync(queryEmbedding, new() { Top = 1 });

        // Materialize the result stream and hand back its first item.
        return (await response.Results.ToListAsync()).First();
    }

    /// <summary>
    /// Vector search with pre-filtering: only entries in the "AI" category are considered.
    /// </summary>
    [Fact]
    public async Task SearchAnInMemoryVectorStoreWithFilteringAsync()
    {
        var glossaryCollection = await GetVectorStoreCollectionWithDataAsync();

        // Turn the question into an embedding vector.
        var question = "How do I provide additional context to an LLM?";
        var queryEmbedding = await fixture.TextEmbeddingGenerationService.GenerateEmbeddingAsync(question);

        // Restrict the search to the "AI" category and request only the closest record.
        var categoryFilter = new VectorSearchFilter().EqualTo(nameof(Glossary.Category), "AI");
        var response = await glossaryCollection.VectorizedSearchAsync(
            queryEmbedding,
            new() { Top = 1, Filter = categoryFilter });
        var hits = await response.Results.ToListAsync();

        // Print the matched definition followed by its score.
        var bestHit = hits.First();
        Console.WriteLine(bestHit.Record.Definition);
        Console.WriteLine(bestHit.Score);
    }

    /// <summary>
    /// Build a fresh in-memory "skglossary" collection and fill it using the ingestion code from step 1.
    /// </summary>
    /// <returns>The populated collection.</returns>
    private async Task<IVectorStoreRecordCollection<string, Glossary>> GetVectorStoreCollectionWithDataAsync()
    {
        // A new in-memory store is created per call.
        var glossaryCollection = new InMemoryVectorStore().GetCollection<string, Glossary>("skglossary");

        // Reuse the ingestion logic from step 1 to load the sample data.
        await Step1_Ingest_Data.IngestDataIntoVectorStoreAsync(glossaryCollection, fixture.TextEmbeddingGenerationService);

        return glossaryCollection;
    }
}
Loading

0 comments on commit f88cf5f

Please sign in to comment.