Commit

Bump versions again to get LangChain extractors
cbornet committed Jul 22, 2024
1 parent 8da4caa commit 4d77dce
Showing 15 changed files with 80 additions and 572 deletions.
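
The point of the version bump is that langchain-community 0.2.9 ships the graph vector store link extractors upstream, so the local copies below can be replaced with thin re-exports. As a quick sanity check of the new import path (illustration only, not part of the commit):

# With the pins from this commit (langchain-community 0.2.9, langchain-core 0.2.22),
# the upstream link extractors should be importable directly:
from langchain_community.graph_vectorstores.extractors import (
    GLiNERLinkExtractor,
    HierarchyLinkExtractor,
    HtmlLinkExtractor,
    KeybertLinkExtractor,
)

print(KeybertLinkExtractor.__module__)  # expected to resolve to a langchain_community module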
6 changes: 3 additions & 3 deletions libs/e2e-tests/pyproject.llamaindex.toml
@@ -42,9 +42,9 @@ llama-index-multi-modal-llms-gemini = { git = "https://github.com/run-llama/llam

llama-parse = { git = "https://github.com/run-llama/llama_parse.git", branch = "main" }

langchain = "0.2.7"
langchain-core = "0.2.12"
langchain-community = "0.2.7"
langchain = "0.2.10"
langchain-core = "0.2.22"
langchain-community = "0.2.9"
langchain-astradb = "0.3.3"
langchain-openai = "0.1.8"
langchain-google-genai = { version = "1.0.6" }
6 changes: 3 additions & 3 deletions libs/langchain/pyproject.toml
@@ -18,9 +18,9 @@ ragstack-ai-colbert = { version = "1.0.5", optional = true }
ragstack-ai-knowledge-store = { version = "0.1.0", optional = true }

# langchain
langchain = "0.2.7"
langchain-core = "0.2.12"
langchain-community = "0.2.7"
langchain = "0.2.10"
langchain-core = "0.2.22"
langchain-community = "0.2.9"
langchain-astradb = "0.3.3"
langchain-openai = "0.1.8"
langchain-google-genai = { version = "1.0.6", optional = true }
1 change: 0 additions & 1 deletion libs/langchain/ragstack_langchain/graph_store/cassandra.py
@@ -2,7 +2,6 @@
    CassandraGraphVectorStore as CassandraGraphStore,
)

-
__all__ = [
    "CassandraGraphStore",
]
@@ -1,57 +1,9 @@
-from typing import Any, Dict, Iterable, List, Optional, Set
-
-from langchain_core.graph_vectorstores import Link
-
-from ragstack_langchain.graph_store.extractors.link_extractor import LinkExtractor
-
-# TypeAlias is not available in Python 3.9, we can't use that or the newer `type`.
-GLiNERInput = str
-
-
-class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
-    def __init__(
-        self,
-        labels: List[str],
-        *,
-        kind: str = "entity",
-        model: str = "urchade/gliner_mediumv2.1",
-        extract_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        """Extract named entities using GLiNER.
-        Args:
-            kind: Kind of links to produce with this extractor.
-            labels: List of kinds of entities to extract.
-            model: GLiNER model to use.
-            extract_kwargs: Keyword arguments to pass to GLiNER.
-        """
-        try:
-            from gliner import GLiNER
-
-            self._model = GLiNER.from_pretrained(model)
-
-        except ImportError:
-            raise ImportError(
-                "gliner is required for GLiNERLinkExtractor. "
-                "Please install it with `pip install gliner`."
-            ) from None
-
-        self._labels = labels
-        self._kind = kind
-        self._extract_kwargs = extract_kwargs or {}
-
-    def extract_one(self, input: GLiNERInput) -> Set[Link]:  # noqa: A002
-        return next(self.extract_many([input]))
-
-    def extract_many(
-        self,
-        inputs: Iterable[GLiNERInput],
-    ) -> Iterable[Set[Link]]:
-        strs = [i if isinstance(i, str) else i.page_content for i in inputs]
-        for entities in self._model.batch_predict_entities(
-            strs, self._labels, **self._extract_kwargs
-        ):
-            yield {
-                Link.bidir(kind=f"{self._kind}:{e['label']}", tag=e["text"])
-                for e in entities
-            }
+from langchain_community.graph_vectorstores.extractors import (
+    GLiNERInput,
+    GLiNERLinkExtractor,
+)
+
+__all__ = [
+    "GLiNERInput",
+    "GLiNERLinkExtractor",
+]
@@ -1,61 +1,9 @@
-from typing import Callable, List, Set
-
-from langchain_core.documents import Document
-from langchain_core.graph_vectorstores import Link
-
-from .link_extractor import LinkExtractor
-from .link_extractor_adapter import LinkExtractorAdapter
-
-# TypeAlias is not available in Python 3.9, we can't use that or the newer `type`.
-HierarchyInput = List[str]
-
-
-class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
-    def __init__(
-        self,
-        kind: str = "hierarchy",
-        up_links: bool = True,
-        down_links: bool = False,
-        sibling_links: bool = False,
-    ):
-        """Extract links from a document hierarchy.
-        Args:
-            kind: Kind of links to produce with this extractor.
-            up_links: Link from a section to its parent.
-            down_links: Link from a section to its children.
-            sibling_links: Link from a section to other sections with the same parent.
-        """
-        self._kind = kind
-        self._up_links = up_links
-        self._down_links = down_links
-        self._sibling_links = sibling_links
-
-    def as_document_extractor(
-        self, hierarchy: Callable[[Document], HierarchyInput]
-    ) -> LinkExtractor[Document]:
-        return LinkExtractorAdapter(underlying=self, transform=hierarchy)
-
-    def extract_one(
-        self,
-        input: HierarchyInput,  # noqa: A002
-    ) -> Set[Link]:
-        this_path = "/".join(input)
-        parent_path = None
-
-        links = set()
-        if self._up_links:
-            links.add(Link.incoming(kind=self._kind, tag=f"up:{this_path}"))
-        if self._down_links:
-            links.add(Link.outgoing(kind=self._kind, tag=f"down:{this_path}"))
-
-        if len(input) >= 1:
-            parent_path = "/".join(input[0:-1])
-            if self._up_links and len(input) > 1:
-                links.add(Link.outgoing(kind=self._kind, tag=f"up:{parent_path}"))
-            if self._down_links and len(input) > 1:
-                links.add(Link.incoming(kind=self._kind, tag=f"down:{parent_path}"))
-            if self._sibling_links:
-                links.add(Link.bidir(kind=self._kind, tag=f"sib:{parent_path}"))
-
-        return links
+from langchain_community.graph_vectorstores.extractors import (
+    HierarchyInput,
+    HierarchyLinkExtractor,
+)
+
+__all__ = [
+    "HierarchyInput",
+    "HierarchyLinkExtractor",
+]
@@ -1,119 +1,9 @@
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Set, Union
-from urllib.parse import urldefrag, urljoin, urlparse
-
-from langchain_core.documents import Document
-from langchain_core.graph_vectorstores import Link
-
-from .link_extractor import LinkExtractor
-from .link_extractor_adapter import LinkExtractorAdapter
-
-if TYPE_CHECKING:
-    from bs4 import BeautifulSoup
-
-
-def _parse_url(link, page_url, drop_fragments: bool = True):
-    href = link.get("href")
-    if href is None:
-        return None
-    url = urlparse(href)
-    if url.scheme not in ["http", "https", ""]:
-        return None
-
-    # Join the HREF with the page_url to convert relative paths to absolute.
-    url = urljoin(page_url, href)
-
-    # Fragments would be useful if we chunked a page based on section.
-    # Then, each chunk would have a different URL based on the fragment.
-    # Since we aren't doing that yet, they just "break" links. So, drop
-    # the fragment.
-    if drop_fragments:
-        return urldefrag(url).url
-    return url
-
-
-def _parse_hrefs(
-    soup: "BeautifulSoup", url: str, drop_fragments: bool = True
-) -> Set[str]:
-    links = soup.find_all("a")
-    links = {
-        _parse_url(link, page_url=url, drop_fragments=drop_fragments) for link in links
-    }
-
-    # Remove entries for any 'a' tag that failed to parse (didn't have href,
-    # or invalid domain, etc.)
-    links.discard(None)
-
-    # Remove self links.
-    links.discard(url)
-
-    return links
-
-
-@dataclass
-class HtmlInput:
-    content: Union[str, "BeautifulSoup"]
-    base_url: str
-
-
-class HtmlLinkExtractor(LinkExtractor[HtmlInput]):
-    def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True):
-        """Extract hyperlinks from HTML content.
-        Expects the input to be an HTML string or a `BeautifulSoup` object.
-        Args:
-            kind: The kind of edge to extract. Defaults to "hyperlink".
-            drop_fragments: Whether fragments in URLs and links should be
-                dropped. Defaults to `True`.
-        """
-        try:
-            import bs4  # noqa:F401
-        except ImportError as e:
-            raise ImportError(
-                "BeautifulSoup4 is required for HtmlLinkExtractor. "
-                "Please install it with `pip install beautifulsoup4`."
-            ) from e
-
-        self._kind = kind
-        self.drop_fragments = drop_fragments
-
-    def as_document_extractor(
-        self, url_metadata_key: str = "source"
-    ) -> LinkExtractor[Document]:
-        """Return a LinkExtractor that applies to documents.
-        NOTE: Since the HtmlLinkExtractor parses HTML, if you use it with other similar
-        link extractors it may be more efficient to call the link extractors directly
-        on the parsed BeautifulSoup object.
-        Args:
-            url_metadata_key: The name of the field in document metadata with the URL of
-                the document.
-        """
-        return LinkExtractorAdapter(
-            underlying=self,
-            transform=lambda doc: HtmlInput(
-                doc.page_content, doc.metadata[url_metadata_key]
-            ),
-        )
-
-    def extract_one(
-        self,
-        input: HtmlInput,  # noqa: A002
-    ) -> Set[Link]:
-        content = input.content
-        if isinstance(content, str):
-            from bs4 import BeautifulSoup
-
-            content = BeautifulSoup(content, "html.parser")
-
-        base_url = input.base_url
-        if self.drop_fragments:
-            base_url = urldefrag(base_url).url
-
-        hrefs = _parse_hrefs(content, base_url, self.drop_fragments)
-
-        links = {Link.outgoing(kind=self._kind, tag=url) for url in hrefs}
-        links.add(Link.incoming(kind=self._kind, tag=base_url))
-        return links
+from langchain_community.graph_vectorstores.extractors import (
+    HtmlInput,
+    HtmlLinkExtractor,
+)
+
+__all__ = [
+    "HtmlInput",
+    "HtmlLinkExtractor",
+]
@@ -1,63 +1,9 @@
-from typing import Any, Dict, Iterable, Optional, Set, Union
-
-from langchain_core.documents import Document
-from langchain_core.graph_vectorstores import Link
-
-from ragstack_langchain.graph_store.extractors.link_extractor import LinkExtractor
-
-# TypeAlias is not available in Python 3.9, we can't use that or the newer `type`.
-KeybertInput = Union[str, Document]
-
-
-class KeybertLinkExtractor(LinkExtractor[KeybertInput]):
-    def __init__(
-        self,
-        *,
-        kind: str = "kw",
-        embedding_model: str = "all-MiniLM-L6-v2",
-        extract_keywords_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        """Extract keywords using KeyBERT.
-        Args:
-            kind: Kind of links to produce with this extractor.
-            embedding_model: Name of the embedding model to use with KeyBERT.
-            extract_keywords_kwargs: Keyword arguments to pass to KeyBERT's
-                `extract_keywords` method.
-        """
-        try:
-            import keybert
-
-            self._kw_model = keybert.KeyBERT(model=embedding_model)
-        except ImportError:
-            raise ImportError(
-                "keybert is required for KeybertLinkExtractor. "
-                "Please install it with `pip install keybert`."
-            ) from None
-
-        self._kind = kind
-        self._extract_keywords_kwargs = extract_keywords_kwargs or {}
-
-    def extract_one(self, input: KeybertInput) -> Set[Link]:  # noqa: A002
-        keywords = self._kw_model.extract_keywords(
-            input if isinstance(input, str) else input.page_content,
-            **self._extract_keywords_kwargs,
-        )
-        return {Link.bidir(kind=self._kind, tag=kw[0]) for kw in keywords}
-
-    def extract_many(
-        self,
-        inputs: Iterable[KeybertInput],
-    ) -> Iterable[Set[Link]]:
-        if len(inputs) == 1:
-            # Even though we pass a list, if it contains one item, keybert will
-            # flatten it. This means it's easier to just call the special case
-            # for one item.
-            yield self.extract_one(inputs[0])
-        elif len(inputs) > 1:
-            strs = [i if isinstance(i, str) else i.page_content for i in inputs]
-            extracted = self._kw_model.extract_keywords(
-                strs, **self._extract_keywords_kwargs
-            )
-            for keywords in extracted:
-                yield {Link.bidir(kind=self._kind, tag=kw[0]) for kw in keywords}
+from langchain_community.graph_vectorstores.extractors import (
+    KeybertInput,
+    KeybertLinkExtractor,
+)
+
+__all__ = [
+    "KeybertInput",
+    "KeybertLinkExtractor",
+]
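
With the modules above reduced to re-exports, downstream imports through ragstack keep resolving but are now backed by the langchain-community implementations. A minimal usage sketch (assuming the ragstack_langchain.graph_store.extractors package re-exports KeybertLinkExtractor and that keybert is installed; neither is shown in this diff):

# Illustration only, not part of the commit.
from ragstack_langchain.graph_store.extractors import KeybertLinkExtractor

extractor = KeybertLinkExtractor()  # now the langchain_community implementation
links = extractor.extract_one(
    "Graph vector stores connect documents that share keywords."
)
for link in links:
    print(link)  # each Link carries the kind ("kw" by default), a direction, and the keyword tag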

0 comments on commit 4d77dce
