diff --git a/python-threatexchange/threatexchange/exchanges/clients/fb_threatexchange/api.py b/python-threatexchange/threatexchange/exchanges/clients/fb_threatexchange/api.py
index 4325a248c..250ecced4 100644
--- a/python-threatexchange/threatexchange/exchanges/clients/fb_threatexchange/api.py
+++ b/python-threatexchange/threatexchange/exchanges/clients/fb_threatexchange/api.py
@@ -15,7 +15,7 @@
import urllib.error
import requests
-from requests.packages.urllib3.util.retry import Retry
+from urllib3.util.retry import Retry
from threatexchange.exchanges.clients.utils.common import TimeoutHTTPAdapter
diff --git a/python-threatexchange/threatexchange/exchanges/clients/ncmec/hash_api.py b/python-threatexchange/threatexchange/exchanges/clients/ncmec/hash_api.py
index 316b45861..e534fdd08 100644
--- a/python-threatexchange/threatexchange/exchanges/clients/ncmec/hash_api.py
+++ b/python-threatexchange/threatexchange/exchanges/clients/ncmec/hash_api.py
@@ -9,6 +9,7 @@
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
+from copy import deepcopy
from dataclasses import dataclass
from enum import Enum, unique
import logging
@@ -19,7 +20,7 @@
import urllib.parse
import requests
-from requests.packages.urllib3.util.retry import Retry
+from urllib3.util.retry import Retry
from threatexchange.exchanges.clients.utils.common import TimeoutHTTPAdapter
@@ -113,24 +114,129 @@ def str(self, key: str) -> str:
@dataclass
class StatusResult:
+ """
+ Represents a NCMEC member.
+
+ While it does correspond to the /status endpoint, we should probably
+ rename it at this point.
+ """
+
esp_id: int
esp_name: str
+ @classmethod
+ def from_xml(cls, xml: _XMLWrapper) -> "StatusResult":
+ return cls(esp_id=xml.int("id"), esp_name=xml.text)
+
@unique
class NCMECEntryType(Enum):
+ """Type of entry, as marked by xml"""
+
image = "image"
video = "video"
+@unique
+class FingerprintType(Enum):
+ """
+ The list of supported fingerprints, as of 10/2024.
+
+ This also corresponds to the feedback types for the upvote/downvote API
+
+ We are currently not parsing these in the returned entry, to prevent
+ compatibility issues if NCMEC were to add more fingerprint types, and
+ are instead returning them as simple strings.
+ """
+
+ md5 = "MD5"
+ sha1 = "SHA1"
+ pdna = "PDNA"
+ pdq = "PDQ"
+ netclean = "NETCLEAN"
+ videntifier = "VIDENTIFIER"
+ tmk_pdqf = "TMK_PDQF"
+ ssvh_pdna = "SSVH_PDNA"
+ ssvh_safer_hash = "SSVH_SAFER_HASH"
+
+
+@dataclass
+class Feedback:
+ """Feedback on a single fingerprint in an entry"""
+
+ # True = upvote | False = downvote
+ sentiment: bool
+ # The member giving the feedback
+ member: StatusResult
+ # For downvotes, the reason text
+ reason: str = ""
+
+ @classmethod
+ def get_from_entry_feedback(
+ cls, entry: _XMLWrapper
+ ) -> t.Dict[str, t.List["Feedback"]]:
+ feedbacks_xml = entry.maybe("feedback")
+ if not feedbacks_xml:
+ return {}
+ ret: t.Dict[str, t.List[Feedback]] = {}
+
+ for sentimentTag in feedbacks_xml:
+ feedbacks = ret.setdefault(sentimentTag.str("type"), [])
+ if sentimentTag.tag == "affirmativeFeedback":
+ # Iterate over members
+ for m in sentimentTag.maybe("members"):
+ feedbacks.append(cls(True, StatusResult.from_xml(m)))
+ elif sentimentTag.tag == "negativeFeedback":
+ # It's impossible to tell from the public documentation how
+ # to parse this correctly because it's ambiguous how it's
+ # formatted.
+ reason_block = list(sentimentTag.maybe("reasons"))
+ for i in range(0, len(reason_block), 2):
+ if i + 1 >= len(reason_block):
+ logging.warning("[ncmec] reason block has odd number of blocks")
+ continue
+ reason = reason_block[i]
+ members = reason_block[i + 1]
+ if reason.tag != "reason" or members.tag != "members":
+ logging.warning(
+ "[ncmec] reason block malformed: reason:%s remembers:%s",
+ reason.tag,
+ members.tag,
+ )
+ continue
+
+ reason_name = reason.str("name")
+ for m in members:
+ feedbacks.append(
+ cls(False, StatusResult.from_xml(m), reason_name)
+ )
+ else:
+ logging.warning(
+ "[ncmec] Ignoring unknown sentiment '%s'", sentimentTag.tag
+ )
+ continue
+ return ret
+
+
@dataclass
class NCMECEntryUpdate:
+ # The entry id
id: str
+ # The esp_id for the uploader
member_id: int
+ # The entry or content type (e.g. image/video)
entry_type: NCMECEntryType
+ # Whether or not this is a tombstone for a deleted entry
deleted: bool
+ # The string classification of the entry
classification: t.Optional[str]
+ # The hashes/fingerprints for this entry, Dict[type, value]. This is
+ # roughly equivalent to SignalType.get_name(): signal_value, but NCMEC
+ # chooses different names for these
fingerprints: t.Dict[str, str]
+ # The feedback (upvote/downvote) that other ESPs have given for this entry
+ # Keyed the same way as fingerprints
+ feedback: t.Dict[str, t.List[Feedback]]
@classmethod
def from_xml(cls, xml: _XMLWrapper) -> "NCMECEntryUpdate":
@@ -148,6 +254,7 @@ def from_xml(cls, xml: _XMLWrapper) -> "NCMECEntryUpdate":
fingerprints={
x.tag: x.text for x in xml.maybe("fingerprints") if x.has_text
},
+ feedback=Feedback.get_from_entry_feedback(xml),
)
@@ -215,11 +322,31 @@ def estimated_entries_in_range(self) -> int:
)
+# TODO: once we know the shape of response, finish this class
+@dataclass
+class UpdateEntryResponse:
+ updates: t.List[NCMECEntryUpdate]
+
+ @classmethod
+ def from_xml(
+ cls, xml: _XMLWrapper, fallback_max_time: int
+ ) -> "UpdateEntryResponse":
+ updates: t.List[NCMECEntryUpdate] = []
+
+ for content_xml in (xml.maybe("images"), xml.maybe("videos")):
+ if not content_xml or not len(content_xml):
+ continue
+ updates.extend(NCMECEntryUpdate.from_xml(c) for c in content_xml)
+
+ return cls(updates)
+
+
@unique
class NCMECEndpoint(Enum):
status = "status"
entries = "entries"
members = "members"
+ feedback = "feedback"
class NCMECEnvironment(Enum):
@@ -266,10 +393,13 @@ def __init__(
self.username = username
self.password = password
self._base_url = environment.value
+ self._my_esp: t.Optional[StatusResult] = None
+ # type -> guid -> name
+ self._feedback_reason_map: t.Dict[FingerprintType, t.Dict[str, str]] = {}
def _get_session(self) -> requests.Session:
"""
- Custom requests sesson
+ Custom requests session
Ideally, should be used within a context manager:
```
@@ -295,7 +425,9 @@ def _get_session(self) -> requests.Session:
)
return session
- def _get(self, endpoint: NCMECEndpoint, *, next_: str = "", **params) -> ET.Element:
+ def _get(
+ self, endpoint: NCMECEndpoint, *, path: str = "", next_: str = "", **params
+ ) -> ET.Element:
"""
Perform an HTTP GET request, and return the XML response payload.
@@ -303,6 +435,8 @@ def _get(self, endpoint: NCMECEndpoint, *, next_: str = "", **params) -> ET.Elem
"""
url = "/".join((self._base_url, self.VERSION, endpoint.value))
+ if path:
+ url = "/".join((url, path))
if next_:
url = self._base_url + next_
params = {}
@@ -328,25 +462,71 @@ def _post(self, endpoint: NCMECEndpoint, *, data=None) -> t.Any:
No timeout or retry strategy.
"""
- url = "/".join((self._base_url, endpoint.value))
+ url = "/".join((self._base_url, self.VERSION, endpoint.value))
with self._get_session() as session:
response = session.post(url, data=data)
response.raise_for_status()
return response
+ def _put(
+ self,
+ endpoint: NCMECEndpoint,
+ *,
+ member_id: t.Optional[int] = None,
+ entry_id: t.Optional[str] = None,
+ feedback_type: t.Optional[FingerprintType] = None,
+ data=None,
+ ) -> t.Any:
+ """
+ Perform an HTTP PUT request, and return the response object.
+
+ No timeout or retry strategy.
+ """
+
+ url = "/".join((self._base_url, self.VERSION, endpoint.value))
+ if feedback_type and member_id and entry_id:
+ url = "/".join(
+ (
+ self._base_url,
+ endpoint.value,
+ str(member_id),
+ entry_id,
+ feedback_type.value,
+ NCMECEndpoint.feedback.value,
+ )
+ )
+ with self._get_session() as session:
+ response = session.put(url, data=data)
+ response.raise_for_status()
+ return response
+
def status(self) -> StatusResult:
"""Query the status endpoint, which tells you who you are."""
response = self._get(NCMECEndpoint.status)
- member = _XMLWrapper(response)["member"]
- return StatusResult(member.int("id"), member.text)
+ ret = StatusResult.from_xml(_XMLWrapper(response)["member"])
+ self._my_esp = deepcopy(ret)
+ return ret
def members(self) -> t.List[StatusResult]:
"""Query the members endpoint, which gives you a list of esps"""
response = self._get(NCMECEndpoint.members)
- return [
- StatusResult(member.int("id"), member.text)
- for member in _XMLWrapper(response)
- ]
+ return [StatusResult.from_xml(member) for member in _XMLWrapper(response)]
+
+ def feedback_reasons(self, fingerprint_type: FingerprintType) -> t.Dict[str, str]:
+ """
+ Get the possible negative feedback reasons for this type
+
+ Returns a dict of reason GUID -> reason name, also cached on this client.
+ """
+ xml = _XMLWrapper(
+ self._get(NCMECEndpoint.feedback, path=f"{fingerprint_type.value}/reasons")
+ )
+ ret = {
+ reason.str("guid"): reason.str("name")
+ for reason in xml["availableFeedbackReasons"]
+ }
+ self._feedback_reason_map[fingerprint_type] = deepcopy(ret)
+ return ret
def get_entries(
self,
@@ -401,6 +581,54 @@ def get_entries_iter(
has_more = bool(next_)
yield result
+ def submit_feedback(
+ self,
+ entry_id: str,
+ fingerprint_type: FingerprintType,
+ affirmative: bool,
+ negative_reason_guid: t.Optional[str] = None,
+ ) -> None:
+
+ # need member_id to submit feedback
+ my_esp = self._my_esp
+ if my_esp is None:
+ my_esp = self.status()
+
+ # Prepare the XML payload
+ root = ET.Element("feedbackSubmission")
+ root.set("xmlns", "https://hashsharing.ncmec.org/hashsharing/v2")
+ vote = ET.SubElement(root, "affirmative" if affirmative else "negative")
+
+ if not affirmative:
+ if not negative_reason_guid:
+ # We need a reason ID, but there may be only one choice
+ # so we can just use that one
+ if fingerprint_type not in self._feedback_reason_map:
+ self.feedback_reasons(fingerprint_type)
+ feedback_options = self._feedback_reason_map[fingerprint_type]
+ if not feedback_options:
+ raise Exception(
+ "No feedback options for this type? Try reaching out to NCMEC"
+ )
+ if len(feedback_options) == 1:
+ # Only one choice
+ negative_reason_guid = next(iter(feedback_options.keys()))
+ if not negative_reason_guid:
+ raise Exception(
+ f"Need to pick a feedback reason. Options: {feedback_options}"
+ )
+ reasons = ET.SubElement(vote, "reasonIds")
+ guid = ET.SubElement(reasons, "guid")
+ guid.text = negative_reason_guid
+
+ self._put(
+ NCMECEndpoint.entries,
+ member_id=my_esp.esp_id,
+ entry_id=entry_id,
+ feedback_type=fingerprint_type,
+ data=ET.tostring(root),
+ )
+
def _date_format(timestamp: int) -> str:
"""ISO 8601 format yyyy-MM-dd'T'HH:mm:ss.SSSZ"""
diff --git a/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/data.py b/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/data.py
new file mode 100644
index 000000000..21b7a9ca5
--- /dev/null
+++ b/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/data.py
@@ -0,0 +1,220 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+STATUS_XML = """
+
+
+ 127.0.0.1
+ testington
+ Sir Testington
+
+""".strip()
+
+NEXT_UNESCAPED = (
+ "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z"
+ "&to=2017-10-30T00%3A00%3A00.000Z&start=2001&size=1000&max=3000"
+)
+
+NEXT_UNESCAPED2 = (
+ "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z"
+ "&to=2017-10-30T00%3A00%3A00.000Z&start=3001&size=1000&max=4000"
+)
+NEXT_UNESCAPED3 = (
+ "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z"
+ "&to=2017-10-30T00%3A00%3A00.000Z&start=4001&size=1000&max=5000"
+)
+
+ENTRIES_XML = """
+
+
+
+
+ Example Member
+ 2017-10-24T15:00:00Z
+ image1
+ A1
+
+ a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1
+ a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1
+ a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1...
+
+
+
+
+ Example Member
+
+
+
+
+
+
+ Example Member
+
+
+
+
+
+
+ Example Member2
+ image4
+ 2017-10-24T15:10:00Z
+
+
+
+
+
+ Example Member
+ video4
+ 2017-10-24T15:20:00Z
+
+
+
+ /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=2001&size=1000&max=3000
+
+
+""".strip()
+
+
+ENTRIES_XML2 = """
+
+
+
+
+ Example Member
+ 2019-10-24T15:00:00Z
+ image10
+ A1
+
+ b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1
+ b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1
+ b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1...
+
+
+
+
+
+ /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=3001&size=1000&max=4000
+
+
+""".strip()
+
+# This example isn't in the documentation, but shows how updates work
+ENTRIES_XML3 = """
+
+
+
+
+
+
+
+ /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=4001&size=1000&max=5000
+
+
+""".strip()
+
+ENTRIES_XML4 = """
+
+
+
+
+
+ TX Example
+ 2019-11-25T15:10:00Z
+ willdelete
+
+
+
+""".strip()
+
+ENTRIES_LARGE_FINGERPRINTS = """
+
+
+
+
+
+
+""".strip()
+
+STATUS_XML = """
+
+
+ 1.1.1.1
+ test_user
+ test member
+
+""".strip()
+
+FEEDBACK_REASONS_XML = """
+
+
+
+
+""".strip()
+
+AFFIRMATIVE_FEEDBACK_XML = """
+
+
+
+
+
+
+""".strip()
+
+NEGATIVE_FEEDBACK_XML = """
+
+
+
+
+ 01234567-abcd-0123-4567-012345678900
+
+
+
+""".strip()
+
+UPDATE_FEEDBACK_RESULT_XML = """
+
+
+
+
+""".strip()
diff --git a/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/test_hash_api.py b/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/test_hash_api.py
index 89968543a..a12330091 100644
--- a/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/test_hash_api.py
+++ b/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/test_hash_api.py
@@ -1,186 +1,41 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
from unittest.mock import Mock
-import urllib.parse
import typing as t
import pytest
import requests
from threatexchange.exchanges.clients.ncmec.hash_api import (
NCMECEntryType,
NCMECEntryUpdate,
+ FingerprintType,
NCMECHashAPI,
NCMECEnvironment,
+ StatusResult,
)
-
-STATUS_XML = """
-
-
- 127.0.0.1
- testington
- Sir Testington
-
-""".strip()
-
-NEXT_UNESCAPED = (
- "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z"
- "&to=2017-10-30T00%3A00%3A00.000Z&start=2001&size=1000&max=3000"
-)
-
-NEXT_UNESCAPED2 = (
- "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z"
- "&to=2017-10-30T00%3A00%3A00.000Z&start=3001&size=1000&max=4000"
+from threatexchange.exchanges.clients.ncmec.tests.data import (
+ ENTRIES_LARGE_FINGERPRINTS,
+ ENTRIES_XML,
+ ENTRIES_XML2,
+ ENTRIES_XML3,
+ ENTRIES_XML4,
+ NEXT_UNESCAPED,
+ NEXT_UNESCAPED2,
+ NEXT_UNESCAPED3,
+ STATUS_XML,
+ UPDATE_FEEDBACK_RESULT_XML,
)
-NEXT_UNESCAPED3 = (
- "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z"
- "&to=2017-10-30T00%3A00%3A00.000Z&start=4001&size=1000&max=5000"
-)
-
-ENTRIES_XML = """
-
-
-
-
- Example Member
- 2017-10-24T15:00:00Z
- image1
- A1
-
- a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1
- a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1
- a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1...
-
-
-
- Example Member2
- image4
- 2017-10-24T15:10:00Z
-
-
-
-
-
- Example Member
- video4
- 2017-10-24T15:20:00Z
-
-
-
- /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=2001&size=1000&max=3000
-
-
-""".strip()
-
-
-ENTRIES_XML2 = """
-
-
-
-
- Example Member
- 2019-10-24T15:00:00Z
- image10
- A1
-
- b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1
- b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1
- b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1...
-
-
-
-
- /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=3001&size=1000&max=4000
-
-
-""".strip()
-
-# This example isn't in the documentation, but shows how updates work
-ENTRIES_XML3 = """
-
-
-
-
-
-
-
- /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=4001&size=1000&max=5000
-
-
-""".strip()
-
-ENTRIES_XML4 = """
-
-
-
-
-
- TX Example
- 2019-11-25T15:10:00Z
- willdelete
-
-
-
-""".strip()
-
-ENTRIES_LARGE_FINGERPRINTS = """
-
-
-
-
-
-
-""".strip()
def mock_get_impl(url: str, **params):
content = ENTRIES_XML
if url.endswith(NEXT_UNESCAPED):
content = ENTRIES_XML2
- if url.endswith(NEXT_UNESCAPED2):
+ elif url.endswith(NEXT_UNESCAPED2):
content = ENTRIES_XML3
- if url.endswith(NEXT_UNESCAPED3):
+ elif url.endswith(NEXT_UNESCAPED3):
content = ENTRIES_XML4
+ elif url.endswith("/status"):
+ content = STATUS_XML
# Void your warantee by messing with requests state
resp = requests.Response()
resp._content = content.encode()
@@ -216,6 +71,7 @@ def api(monkeypatch: pytest.MonkeyPatch):
session = Mock(
strict_spec=["get", "__enter__", "__exit__"],
get=mock_get_impl,
+ _put=Mock(),
__enter__=lambda _: session,
__exit__=lambda *args: None,
)
@@ -274,6 +130,14 @@ def assert_fifth_entry(entry: NCMECEntryUpdate) -> None:
}
+def test_mocked_status(api: NCMECHashAPI):
+ assert api._my_esp is None
+ result = api.status()
+ assert result.esp_id == 1
+ assert result.esp_name == "test member"
+ assert result == api._my_esp
+
+
def test_mocked_get_hashes(api: NCMECHashAPI):
result = api.get_entries()
@@ -323,3 +187,15 @@ def test_large_fingerprint_entries(monkeypatch):
assert len(update.fingerprints) == 1
assert update.fingerprints == {"md5": "facefacefacefacefacefacefaceface"}
assert result.next == ""
+
+
+def test_feedback_entries(api: NCMECHashAPI):
+ # Our own ESP is fetched lazily via the mocked /status response
+
+ api.submit_feedback("image1", FingerprintType.md5, True)
+ api.submit_feedback(
+ "image1",
+ FingerprintType.md5,
+ False,
+ "01234567-abcd-0123-4567-012345678900",
+ )
diff --git a/python-threatexchange/threatexchange/signal_type/signal_base.py b/python-threatexchange/threatexchange/signal_type/signal_base.py
index 3e889efa5..393cf6339 100644
--- a/python-threatexchange/threatexchange/signal_type/signal_base.py
+++ b/python-threatexchange/threatexchange/signal_type/signal_base.py
@@ -6,7 +6,6 @@
import abc
import pathlib
-import random
import typing as t
from threatexchange import common