From 4d77dceabe97b05c6e79b288b94b2dbf6baa385f Mon Sep 17 00:00:00 2001 From: Christophe Bornet Date: Mon, 22 Jul 2024 10:33:02 +0200 Subject: [PATCH] Bump versions again to get LangChain extractors --- libs/e2e-tests/pyproject.llamaindex.toml | 6 +- libs/langchain/pyproject.toml | 6 +- .../graph_store/cassandra.py | 1 - .../extractors/gliner_link_extractor.py | 66 ++------- .../extractors/hierarchy_link_extractor.py | 70 ++-------- .../extractors/html_link_extractor.py | 128 ++---------------- .../extractors/keybert_link_extractor.py | 72 ++-------- .../graph_store/extractors/link_extractor.py | 40 +----- .../extractors/link_extractor_adapter.py | 28 +--- .../extractors/link_extractor_transformer.py | 3 +- .../unit_tests/test_gliner_link_extractor.py | 2 +- .../test_hierarchy_link_extractor.py | 83 ------------ .../unit_tests/test_html_link_extractor.py | 106 --------------- .../unit_tests/test_keybert_link_extractor.py | 2 +- .../test_link_extractor_transformer.py | 39 ++++-- 15 files changed, 80 insertions(+), 572 deletions(-) delete mode 100644 libs/langchain/tests/unit_tests/test_hierarchy_link_extractor.py delete mode 100644 libs/langchain/tests/unit_tests/test_html_link_extractor.py diff --git a/libs/e2e-tests/pyproject.llamaindex.toml b/libs/e2e-tests/pyproject.llamaindex.toml index 5729a3266..939dd1422 100644 --- a/libs/e2e-tests/pyproject.llamaindex.toml +++ b/libs/e2e-tests/pyproject.llamaindex.toml @@ -42,9 +42,9 @@ llama-index-multi-modal-llms-gemini = { git = "https://github.com/run-llama/llam llama-parse = { git = "https://github.com/run-llama/llama_parse.git", branch = "main" } -langchain = "0.2.7" -langchain-core = "0.2.12" -langchain-community = "0.2.7" +langchain = "0.2.10" +langchain-core = "0.2.22" +langchain-community = "0.2.9" langchain-astradb = "0.3.3" langchain-openai = "0.1.8" langchain-google-genai = { version = "1.0.6" } diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index d9c72a43a..56e37f827 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -18,9 +18,9 @@ ragstack-ai-colbert = { version = "1.0.5", optional = true } ragstack-ai-knowledge-store = { version = "0.1.0", optional = true } # langchain -langchain = "0.2.7" -langchain-core = "0.2.12" -langchain-community = "0.2.7" +langchain = "0.2.10" +langchain-core = "0.2.22" +langchain-community = "0.2.9" langchain-astradb = "0.3.3" langchain-openai = "0.1.8" langchain-google-genai = { version = "1.0.6", optional = true } diff --git a/libs/langchain/ragstack_langchain/graph_store/cassandra.py b/libs/langchain/ragstack_langchain/graph_store/cassandra.py index 9e11e2148..1a73456be 100644 --- a/libs/langchain/ragstack_langchain/graph_store/cassandra.py +++ b/libs/langchain/ragstack_langchain/graph_store/cassandra.py @@ -2,7 +2,6 @@ CassandraGraphVectorStore as CassandraGraphStore, ) - __all__ = [ "CassandraGraphStore", ] diff --git a/libs/langchain/ragstack_langchain/graph_store/extractors/gliner_link_extractor.py b/libs/langchain/ragstack_langchain/graph_store/extractors/gliner_link_extractor.py index fe980053f..380a90c55 100644 --- a/libs/langchain/ragstack_langchain/graph_store/extractors/gliner_link_extractor.py +++ b/libs/langchain/ragstack_langchain/graph_store/extractors/gliner_link_extractor.py @@ -1,57 +1,9 @@ -from typing import Any, Dict, Iterable, List, Optional, Set - -from langchain_core.graph_vectorstores import Link - -from ragstack_langchain.graph_store.extractors.link_extractor import LinkExtractor - -# TypeAlias is not available in Python 2.9, we can't use that or the newer `type`. -GLiNERInput = str - - -class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]): - def __init__( - self, - labels: List[str], - *, - kind: str = "entity", - model: str = "urchade/gliner_mediumv2.1", - extract_kwargs: Optional[Dict[str, Any]] = None, - ): - """Extract keywords using GLiNER. - - Args: - kind: Kind of links to produce with this extractor. - labels: List of kinds of entities to extract. - model: GLiNER model to use. - extract_kwargs: Keyword arguments to pass to GLiNER. - """ - try: - from gliner import GLiNER - - self._model = GLiNER.from_pretrained(model) - - except ImportError: - raise ImportError( - "gliner is required for GLiNERLinkExtractor. " - "Please install it with `pip install gliner`." - ) from None - - self._labels = labels - self._kind = kind - self._extract_kwargs = extract_kwargs or {} - - def extract_one(self, input: GLiNERInput) -> Set[Link]: # noqa: A002 - return next(self.extract_many([input])) - - def extract_many( - self, - inputs: Iterable[GLiNERInput], - ) -> Iterable[Set[Link]]: - strs = [i if isinstance(i, str) else i.page_content for i in inputs] - for entities in self._model.batch_predict_entities( - strs, self._labels, **self._extract_kwargs - ): - yield { - Link.bidir(kind=f"{self._kind}:{e['label']}", tag=e["text"]) - for e in entities - } +from langchain_community.graph_vectorstores.extractors import ( + GLiNERInput, + GLiNERLinkExtractor, +) + +__all__ = [ + "GLiNERInput", + "GLiNERLinkExtractor", +] diff --git a/libs/langchain/ragstack_langchain/graph_store/extractors/hierarchy_link_extractor.py b/libs/langchain/ragstack_langchain/graph_store/extractors/hierarchy_link_extractor.py index 077564d9c..9cabd0af2 100644 --- a/libs/langchain/ragstack_langchain/graph_store/extractors/hierarchy_link_extractor.py +++ b/libs/langchain/ragstack_langchain/graph_store/extractors/hierarchy_link_extractor.py @@ -1,61 +1,9 @@ -from typing import Callable, List, Set - -from langchain_core.documents import Document -from langchain_core.graph_vectorstores import Link - -from .link_extractor import LinkExtractor -from .link_extractor_adapter import LinkExtractorAdapter - -# TypeAlias is not available in Python 2.9, we can't use that or the newer `type`. -HierarchyInput = List[str] - - -class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]): - def __init__( - self, - kind: str = "hierarchy", - up_links: bool = True, - down_links: bool = False, - sibling_links: bool = False, - ): - """Extract links from a document hierarchy. - - Args: - kind: Kind of links to produce with this extractor. - up_links: Link from a section to it's parent. - down_links: Link from a section to it's children. - sibling_links: Link from a section to other sections with the same parent. - """ - self._kind = kind - self._up_links = up_links - self._down_links = down_links - self._sibling_links = sibling_links - - def as_document_extractor( - self, hierarchy: Callable[[Document], HierarchyInput] - ) -> LinkExtractor[Document]: - return LinkExtractorAdapter(underlying=self, transform=hierarchy) - - def extract_one( - self, - input: HierarchyInput, # noqa: A002 - ) -> Set[Link]: - this_path = "/".join(input) - parent_path = None - - links = set() - if self._up_links: - links.add(Link.incoming(kind=self._kind, tag=f"up:{this_path}")) - if self._down_links: - links.add(Link.outgoing(kind=self._kind, tag=f"down:{this_path}")) - - if len(input) >= 1: - parent_path = "/".join(input[0:-1]) - if self._up_links and len(input) > 1: - links.add(Link.outgoing(kind=self._kind, tag=f"up:{parent_path}")) - if self._down_links and len(input) > 1: - links.add(Link.incoming(kind=self._kind, tag=f"down:{parent_path}")) - if self._sibling_links: - links.add(Link.bidir(kind=self._kind, tag=f"sib:{parent_path}")) - - return links +from langchain_community.graph_vectorstores.extractors import ( + HierarchyInput, + HierarchyLinkExtractor, +) + +__all__ = [ + "HierarchyInput", + "HierarchyLinkExtractor", +] diff --git a/libs/langchain/ragstack_langchain/graph_store/extractors/html_link_extractor.py b/libs/langchain/ragstack_langchain/graph_store/extractors/html_link_extractor.py index c19180b76..d51add4c5 100644 --- a/libs/langchain/ragstack_langchain/graph_store/extractors/html_link_extractor.py +++ b/libs/langchain/ragstack_langchain/graph_store/extractors/html_link_extractor.py @@ -1,119 +1,9 @@ -from dataclasses import dataclass -from typing import TYPE_CHECKING, Set, Union -from urllib.parse import urldefrag, urljoin, urlparse - -from langchain_core.documents import Document -from langchain_core.graph_vectorstores import Link - -from .link_extractor import LinkExtractor -from .link_extractor_adapter import LinkExtractorAdapter - -if TYPE_CHECKING: - from bs4 import BeautifulSoup - - -def _parse_url(link, page_url, drop_fragments: bool = True): - href = link.get("href") - if href is None: - return None - url = urlparse(href) - if url.scheme not in ["http", "https", ""]: - return None - - # Join the HREF with the page_url to convert relative paths to absolute. - url = urljoin(page_url, href) - - # Fragments would be useful if we chunked a page based on section. - # Then, each chunk would have a different URL based on the fragment. - # Since we aren't doing that yet, they just "break" links. So, drop - # the fragment. - if drop_fragments: - return urldefrag(url).url - return url - - -def _parse_hrefs( - soup: "BeautifulSoup", url: str, drop_fragments: bool = True -) -> Set[str]: - links = soup.find_all("a") - links = { - _parse_url(link, page_url=url, drop_fragments=drop_fragments) for link in links - } - - # Remove entries for any 'a' tag that failed to parse (didn't have href, - # or invalid domain, etc.) - links.discard(None) - - # Remove self links. - links.discard(url) - - return links - - -@dataclass -class HtmlInput: - content: Union[str, "BeautifulSoup"] - base_url: str - - -class HtmlLinkExtractor(LinkExtractor[HtmlInput]): - def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True): - """Extract hyperlinks from HTML content. - - Expects the input to be an HTML string or a `BeautifulSoup` object. - - Args: - kind: The kind of edge to extract. Defaults to "hyperlink". - drop_fragments: Whether fragments in URLs and links shoud be - dropped. Defaults to `True`. - """ - try: - import bs4 # noqa:F401 - except ImportError as e: - raise ImportError( - "BeautifulSoup4 is required for HtmlLinkExtractor. " - "Please install it with `pip install beautifulsoup4`." - ) from e - - self._kind = kind - self.drop_fragments = drop_fragments - - def as_document_extractor( - self, url_metadata_key: str = "source" - ) -> LinkExtractor[Document]: - """Return a LinkExtractor that applies to documents. - - NOTE: Since the HtmlLinkExtractor parses HTML, if you use with other similar - link extractors it may be more efficient to call the link extractors directly - on the parsed BeautifulSoup object. - - Args: - url_metadata_key: The name of the filed in document metadata with the URL of - the document. - """ - return LinkExtractorAdapter( - underlying=self, - transform=lambda doc: HtmlInput( - doc.page_content, doc.metadata[url_metadata_key] - ), - ) - - def extract_one( - self, - input: HtmlInput, # noqa: A002 - ) -> Set[Link]: - content = input.content - if isinstance(content, str): - from bs4 import BeautifulSoup - - content = BeautifulSoup(content, "html.parser") - - base_url = input.base_url - if self.drop_fragments: - base_url = urldefrag(base_url).url - - hrefs = _parse_hrefs(content, base_url, self.drop_fragments) - - links = {Link.outgoing(kind=self._kind, tag=url) for url in hrefs} - links.add(Link.incoming(kind=self._kind, tag=base_url)) - return links +from langchain_community.graph_vectorstores.extractors import ( + HtmlInput, + HtmlLinkExtractor, +) + +__all__ = [ + "HtmlInput", + "HtmlLinkExtractor", +] diff --git a/libs/langchain/ragstack_langchain/graph_store/extractors/keybert_link_extractor.py b/libs/langchain/ragstack_langchain/graph_store/extractors/keybert_link_extractor.py index 6f67e867f..725d46991 100644 --- a/libs/langchain/ragstack_langchain/graph_store/extractors/keybert_link_extractor.py +++ b/libs/langchain/ragstack_langchain/graph_store/extractors/keybert_link_extractor.py @@ -1,63 +1,9 @@ -from typing import Any, Dict, Iterable, Optional, Set, Union - -from langchain_core.documents import Document -from langchain_core.graph_vectorstores import Link - -from ragstack_langchain.graph_store.extractors.link_extractor import LinkExtractor - -# TypeAlias is not available in Python 2.9, we can't use that or the newer `type`. -KeybertInput = Union[str, Document] - - -class KeybertLinkExtractor(LinkExtractor[KeybertInput]): - def __init__( - self, - *, - kind: str = "kw", - embedding_model: str = "all-MiniLM-L6-v2", - extract_keywords_kwargs: Optional[Dict[str, Any]] = None, - ): - """Extract keywords using Keybert. - - Args: - kind: Kind of links to produce with this extractor. - embedding_model: Name of the embedding model to use with Keybert. - extract_keywords_kwargs: Keyword arguments to pass to Keybert's - `extract_keywords` method. - """ - try: - import keybert - - self._kw_model = keybert.KeyBERT(model=embedding_model) - except ImportError: - raise ImportError( - "keybert is required for KeybertLinkExtractor. " - "Please install it with `pip install keybert`." - ) from None - - self._kind = kind - self._extract_keywords_kwargs = extract_keywords_kwargs or {} - - def extract_one(self, input: KeybertInput) -> Set[Link]: # noqa: A002 - keywords = self._kw_model.extract_keywords( - input if isinstance(input, str) else input.page_content, - **self._extract_keywords_kwargs, - ) - return {Link.bidir(kind=self._kind, tag=kw[0]) for kw in keywords} - - def extract_many( - self, - inputs: Iterable[KeybertInput], - ) -> Iterable[Set[Link]]: - if len(inputs) == 1: - # Even though we pass a list, if it contains one item, keybert will - # flatten it. This means it's easier to just call the special case - # for one item. - yield self.extract_one(inputs[0]) - elif len(inputs) > 1: - strs = [i if isinstance(i, str) else i.page_content for i in inputs] - extracted = self._kw_model.extract_keywords( - strs, **self._extract_keywords_kwargs - ) - for keywords in extracted: - yield {Link.bidir(kind=self._kind, tag=kw[0]) for kw in keywords} +from langchain_community.graph_vectorstores.extractors import ( + KeybertInput, + KeybertLinkExtractor, +) + +__all__ = [ + "KeybertInput", + "KeybertLinkExtractor", +] diff --git a/libs/langchain/ragstack_langchain/graph_store/extractors/link_extractor.py b/libs/langchain/ragstack_langchain/graph_store/extractors/link_extractor.py index f4c3aa34d..380b14993 100644 --- a/libs/langchain/ragstack_langchain/graph_store/extractors/link_extractor.py +++ b/libs/langchain/ragstack_langchain/graph_store/extractors/link_extractor.py @@ -1,37 +1,5 @@ -from __future__ import annotations +from langchain_community.graph_vectorstores.extractors import LinkExtractor -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Generic, Iterable, Set, TypeVar - -if TYPE_CHECKING: - from langchain_core.graph_vectorstores import Link - -InputT = TypeVar("InputT") - -METADATA_LINKS_KEY = "links" - - -class LinkExtractor(ABC, Generic[InputT]): - """Interface for extracting links (incoming, outgoing, bidirectional).""" - - @abstractmethod - def extract_one(self, input: InputT) -> set[Link]: # noqa: A002 - """Add edges from each `input` to the corresponding documents. - - Args: - input: The input content to extract edges from. - - Returns: - Set of links extracted from the input. - """ - - def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]: - """Add edges from each `input` to the corresponding documents. - - Args: - inputs: The input content to extract edges from. - - Returns: - Iterable over the set of links extracted from the input. - """ - return map(self.extract_one, inputs) +__all__ = [ + "LinkExtractor", +] diff --git a/libs/langchain/ragstack_langchain/graph_store/extractors/link_extractor_adapter.py b/libs/langchain/ragstack_langchain/graph_store/extractors/link_extractor_adapter.py index d4fd7a35d..6a6d0f801 100644 --- a/libs/langchain/ragstack_langchain/graph_store/extractors/link_extractor_adapter.py +++ b/libs/langchain/ragstack_langchain/graph_store/extractors/link_extractor_adapter.py @@ -1,25 +1,5 @@ -from typing import Callable, Iterable, Set, TypeVar +from langchain_community.graph_vectorstores.extractors import LinkExtractorAdapter -from langchain_core.graph_vectorstores import Link - -from ragstack_langchain.graph_store.extractors.link_extractor import LinkExtractor - -InputT = TypeVar("InputT") -UnderlyingInputT = TypeVar("UnderlyingInputT") - - -class LinkExtractorAdapter(LinkExtractor[InputT]): - def __init__( - self, - underlying: LinkExtractor[UnderlyingInputT], - transform: Callable[[InputT], UnderlyingInputT], - ) -> None: - self._underlying = underlying - self._transform = transform - - def extract_one(self, input: InputT) -> Set[Link]: # noqa: A002 - return self.extract_one(self._transform(input)) - - def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]: - underlying_inputs = map(self._transform, inputs) - return self._underlying.extract_many(underlying_inputs) +__all__ = [ + "LinkExtractorAdapter", +] diff --git a/libs/langchain/ragstack_langchain/graph_store/extractors/link_extractor_transformer.py b/libs/langchain/ragstack_langchain/graph_store/extractors/link_extractor_transformer.py index 534f7fb69..2fb27e9c7 100644 --- a/libs/langchain/ragstack_langchain/graph_store/extractors/link_extractor_transformer.py +++ b/libs/langchain/ragstack_langchain/graph_store/extractors/link_extractor_transformer.py @@ -1,11 +1,10 @@ from typing import Iterable, Sequence +from langchain_community.graph_vectorstores.extractors import LinkExtractor from langchain_core.documents import Document from langchain_core.documents.transformers import BaseDocumentTransformer from langchain_core.graph_vectorstores.links import add_links -from ragstack_langchain.graph_store.extractors.link_extractor import LinkExtractor - class LinkExtractorTransformer(BaseDocumentTransformer): def __init__(self, link_extractors: Iterable[LinkExtractor[Document]]): diff --git a/libs/langchain/tests/unit_tests/test_gliner_link_extractor.py b/libs/langchain/tests/unit_tests/test_gliner_link_extractor.py index d521f1fcb..f3cd52122 100644 --- a/libs/langchain/tests/unit_tests/test_gliner_link_extractor.py +++ b/libs/langchain/tests/unit_tests/test_gliner_link_extractor.py @@ -1,5 +1,5 @@ +from langchain_community.graph_vectorstores.extractors import GLiNERLinkExtractor from langchain_core.graph_vectorstores import Link -from ragstack_langchain.graph_store.extractors import GLiNERLinkExtractor PAGE_1 = """ Cristiano Ronaldo dos Santos Aveiro (Portuguese pronunciation: [kɾiʃ'tjɐnu diff --git a/libs/langchain/tests/unit_tests/test_hierarchy_link_extractor.py b/libs/langchain/tests/unit_tests/test_hierarchy_link_extractor.py deleted file mode 100644 index 52494480d..000000000 --- a/libs/langchain/tests/unit_tests/test_hierarchy_link_extractor.py +++ /dev/null @@ -1,83 +0,0 @@ -from langchain_core.graph_vectorstores import Link -from ragstack_langchain.graph_store.extractors import HierarchyLinkExtractor - -PATH_1 = ["Root", "H1", "h2"] - -PATH_2 = ["Root", "H1"] - -PATH_3 = ["Root"] - - -def test_up_only(): - extractor = HierarchyLinkExtractor() - - assert extractor.extract_one(PATH_1) == { - # Path1 links up to Root/H1 - Link.outgoing(kind="hierarchy", tag="up:Root/H1"), - # Path1 is linked to by stuff under Root/H1/h2 - Link.incoming(kind="hierarchy", tag="up:Root/H1/h2"), - } - - assert extractor.extract_one(PATH_2) == { - # Path2 links up to Root - Link.outgoing(kind="hierarchy", tag="up:Root"), - # Path2 is linked to by stuff under Root/H1/h2 - Link.incoming(kind="hierarchy", tag="up:Root/H1"), - } - - assert extractor.extract_one(PATH_3) == { - # Path3 is linked to by stuff under Root - Link.incoming(kind="hierarchy", tag="up:Root"), - } - - -def test_up_and_down(): - extractor = HierarchyLinkExtractor(down_links=True) - - assert extractor.extract_one(PATH_1) == { - # Path1 links up to Root/H1 - Link.outgoing(kind="hierarchy", tag="up:Root/H1"), - # Path1 is linked to by stuff under Root/H1/h2 - Link.incoming(kind="hierarchy", tag="up:Root/H1/h2"), - # Path1 links down to things under Root/H1/h2. - Link.outgoing(kind="hierarchy", tag="down:Root/H1/h2"), - # Path1 is linked down to by Root/H1 - Link.incoming(kind="hierarchy", tag="down:Root/H1"), - } - - assert extractor.extract_one(PATH_2) == { - # Path2 links up to Root - Link.outgoing(kind="hierarchy", tag="up:Root"), - # Path2 is linked to by stuff under Root/H1/h2 - Link.incoming(kind="hierarchy", tag="up:Root/H1"), - # Path2 links down to things under Root/H1. - Link.outgoing(kind="hierarchy", tag="down:Root/H1"), - # Path2 is linked down to by Root - Link.incoming(kind="hierarchy", tag="down:Root"), - } - - assert extractor.extract_one(PATH_3) == { - # Path3 is linked to by stuff under Root - Link.incoming(kind="hierarchy", tag="up:Root"), - # Path3 links down to things under Root/H1. - Link.outgoing(kind="hierarchy", tag="down:Root"), - } - - -def test_sibling(): - extractor = HierarchyLinkExtractor(sibling_links=True, up_links=False) - - assert extractor.extract_one(PATH_1) == { - # Path1 links with anything else in Root/H1 - Link.bidir(kind="hierarchy", tag="sib:Root/H1"), - } - - assert extractor.extract_one(PATH_2) == { - # Path2 links with anything else in Root - Link.bidir(kind="hierarchy", tag="sib:Root"), - } - - assert extractor.extract_one(PATH_3) == { - # Path3 links with anything else at the top level - Link.bidir(kind="hierarchy", tag="sib:"), - } diff --git a/libs/langchain/tests/unit_tests/test_html_link_extractor.py b/libs/langchain/tests/unit_tests/test_html_link_extractor.py deleted file mode 100644 index 0b008e228..000000000 --- a/libs/langchain/tests/unit_tests/test_html_link_extractor.py +++ /dev/null @@ -1,106 +0,0 @@ -from bs4 import BeautifulSoup -from langchain_core.graph_vectorstores import Link -from ragstack_langchain.graph_store.extractors import HtmlInput, HtmlLinkExtractor - -PAGE_1 = """ - - -Hello. -Relative -Relative base. -Aboslute -Test - - -""" - -PAGE_2 = """ - - -Hello. -Relative - -""" - - -def test_one_from_str(): - extractor = HtmlLinkExtractor() - - results = extractor.extract_one(HtmlInput(PAGE_1, base_url="https://foo.com/bar/")) - assert results == { - Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"), - Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"), - Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"), - Link.outgoing(kind="hyperlink", tag="http://cnn.com"), - Link.outgoing(kind="hyperlink", tag="https://same.foo"), - } - - results = extractor.extract_one(HtmlInput(PAGE_1, base_url="http://foo.com/bar/")) - assert results == { - Link.incoming(kind="hyperlink", tag="http://foo.com/bar/"), - Link.outgoing(kind="hyperlink", tag="http://foo.com/bar/relative"), - Link.outgoing(kind="hyperlink", tag="http://foo.com/relative-base"), - Link.outgoing(kind="hyperlink", tag="http://cnn.com"), - Link.outgoing(kind="hyperlink", tag="http://same.foo"), - } - - -def test_one_from_beautiful_soup(): - extractor = HtmlLinkExtractor() - soup = BeautifulSoup(PAGE_1, "html.parser") - results = extractor.extract_one(HtmlInput(soup, base_url="https://foo.com/bar/")) - assert results == { - Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"), - Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"), - Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"), - Link.outgoing(kind="hyperlink", tag="http://cnn.com"), - Link.outgoing(kind="hyperlink", tag="https://same.foo"), - } - - -def test_drop_fragments(): - extractor = HtmlLinkExtractor(drop_fragments=True) - results = extractor.extract_one( - HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment") - ) - - assert results == { - Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"), - Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"), - } - - -def test_include_fragments(): - extractor = HtmlLinkExtractor(drop_fragments=False) - results = extractor.extract_one( - HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment") - ) - - assert results == { - Link.incoming(kind="hyperlink", tag="https://foo.com/baz/#fragment"), - Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/#fragment"), - } - - -def test_batch_from_str(): - extractor = HtmlLinkExtractor() - results = list( - extractor.extract_many( - [ - HtmlInput(PAGE_1, base_url="https://foo.com/bar/"), - HtmlInput(PAGE_2, base_url="https://foo.com/baz/"), - ] - ) - ) - - assert results[0] == { - Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"), - Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"), - Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"), - Link.outgoing(kind="hyperlink", tag="http://cnn.com"), - Link.outgoing(kind="hyperlink", tag="https://same.foo"), - } - assert results[1] == { - Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"), - Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"), - } diff --git a/libs/langchain/tests/unit_tests/test_keybert_link_extractor.py b/libs/langchain/tests/unit_tests/test_keybert_link_extractor.py index b58fdb275..947c1f117 100644 --- a/libs/langchain/tests/unit_tests/test_keybert_link_extractor.py +++ b/libs/langchain/tests/unit_tests/test_keybert_link_extractor.py @@ -1,5 +1,5 @@ +from langchain_community.graph_vectorstores.extractors import KeybertLinkExtractor from langchain_core.graph_vectorstores import Link -from ragstack_langchain.graph_store.extractors import KeybertLinkExtractor PAGE_1 = """ Supervised learning is the machine learning task of learning a function that diff --git a/libs/langchain/tests/unit_tests/test_link_extractor_transformer.py b/libs/langchain/tests/unit_tests/test_link_extractor_transformer.py index 5bcc0e6dc..9902c65b7 100644 --- a/libs/langchain/tests/unit_tests/test_link_extractor_transformer.py +++ b/libs/langchain/tests/unit_tests/test_link_extractor_transformer.py @@ -1,22 +1,37 @@ -from langchain_core.documents import Document -from langchain_core.graph_vectorstores.links import Link, get_links -from ragstack_langchain.graph_store.extractors import ( - HtmlLinkExtractor, - LinkExtractorTransformer, -) -from ragstack_langchain.graph_store.extractors.gliner_link_extractor import ( +from langchain_community.graph_vectorstores.extractors import ( GLiNERLinkExtractor, -) -from ragstack_langchain.graph_store.extractors.keybert_link_extractor import ( + HtmlLinkExtractor, KeybertLinkExtractor, ) +from langchain_core.documents import Document +from langchain_core.graph_vectorstores.links import Link, get_links +from ragstack_langchain.graph_store.extractors import LinkExtractorTransformer from . import ( test_gliner_link_extractor, - test_html_link_extractor, test_keybert_link_extractor, ) +PAGE_1 = """ + + +Hello. +Relative +Relative base. +Aboslute +Test + + +""" + +PAGE_2 = """ + + +Hello. +Relative + +""" + def test_html_extractor(): transformer = LinkExtractorTransformer( @@ -25,13 +40,13 @@ def test_html_extractor(): ] ) doc1 = Document( - page_content=test_html_link_extractor.PAGE_1, + page_content=PAGE_1, metadata={ "source": "https://foo.com/bar/", }, ) doc2 = Document( - page_content=test_html_link_extractor.PAGE_2, + page_content=PAGE_2, metadata={ "source": "https://foo.com/baz/", },