diff --git a/python-threatexchange/threatexchange/exchanges/clients/fb_threatexchange/api.py b/python-threatexchange/threatexchange/exchanges/clients/fb_threatexchange/api.py index 4325a248c..250ecced4 100644 --- a/python-threatexchange/threatexchange/exchanges/clients/fb_threatexchange/api.py +++ b/python-threatexchange/threatexchange/exchanges/clients/fb_threatexchange/api.py @@ -15,7 +15,7 @@ import urllib.error import requests -from requests.packages.urllib3.util.retry import Retry +from urllib3.util.retry import Retry from threatexchange.exchanges.clients.utils.common import TimeoutHTTPAdapter diff --git a/python-threatexchange/threatexchange/exchanges/clients/ncmec/hash_api.py b/python-threatexchange/threatexchange/exchanges/clients/ncmec/hash_api.py index 316b45861..e534fdd08 100644 --- a/python-threatexchange/threatexchange/exchanges/clients/ncmec/hash_api.py +++ b/python-threatexchange/threatexchange/exchanges/clients/ncmec/hash_api.py @@ -9,6 +9,7 @@ import xml.etree.ElementTree as ET from datetime import datetime, timezone +from copy import deepcopy from dataclasses import dataclass from enum import Enum, unique import logging @@ -19,7 +20,7 @@ import urllib.parse import requests -from requests.packages.urllib3.util.retry import Retry +from urllib3.util.retry import Retry from threatexchange.exchanges.clients.utils.common import TimeoutHTTPAdapter @@ -113,24 +114,129 @@ def str(self, key: str) -> str: @dataclass class StatusResult: + """ + Represents a NCMEC member. + + While it does correspond to the /status endpoint, we should probably + rename it at this point. + """ + esp_id: int esp_name: str + @classmethod + def from_xml(cls, xml: _XMLWrapper) -> "StatusResult": + return cls(esp_id=xml.int("id"), esp_name=xml.text) + @unique class NCMECEntryType(Enum): + """Type of entry, as marked by xml""" + image = "image" video = "video" +@unique +class FingerprintType(Enum): + """ + The list of supported fingerprints, as of 10/2024. 
+ + This also corresponds to the feedback types for the upvote/downvote API + + We are currently not parsing these in the returned entry to prevent + compatibility issues if NCMEC were to add more fingerprint types, and + returning them as simple strings. + """ + + md5 = "MD5" + sha1 = "SHA1" + pdna = "PDNA" + pdq = "PDQ" + netclean = "NETCLEAN" + videntifier = "VIDENTIFIER" + tmk_pdqf = "TMK_PDQF" + ssvh_pdna = "SSVH_PDNA" + ssvh_safer_hash = "SSVH_SAFER_HASH" + + +@dataclass +class Feedback: + """Feedback on a single fingerprint in an entry""" + + # True = upvote | False = downvote + sentiment: bool + # The member giving the feedback + member: StatusResult + # For downvotes, the reason text + reason: str = "" + + @classmethod + def get_from_entry_feedback( + cls, entry: _XMLWrapper + ) -> t.Dict[str, t.List["Feedback"]]: + feedbacks_xml = entry.maybe("feedback") + if not feedbacks_xml: + return {} + ret: t.Dict[str, t.List[Feedback]] = {} + + for sentimentTag in feedbacks_xml: + feedbacks = ret.setdefault(sentimentTag.str("type"), []) + if sentimentTag.tag == "affirmativeFeedback": + # Iterate over members + for m in sentimentTag.maybe("members"): + feedbacks.append(cls(True, StatusResult.from_xml(m))) + elif sentimentTag.tag == "negativeFeedback": + # It's impossible to tell from the public documentation how + # to parse this correctly because it's ambiguous how it's + # formatted. 
+ reason_block = list(sentimentTag.maybe("reasons")) + for i in range(0, len(reason_block), 2): + if i + 1 >= len(reason_block): + logging.warning("[ncmec] reason block has odd number of blocks") + continue + reason = reason_block[i] + members = reason_block[i + 1] + if reason.tag != "reason" or members.tag != "members": + logging.warning( + "[ncmec] reason block malformed: reason:%s members:%s", + reason.tag, + members.tag, + ) + continue + + reason_name = reason.str("name") + for m in members: + feedbacks.append( + cls(False, StatusResult.from_xml(m), reason_name) + ) + else: + logging.warning( + "[ncmec] Ignoring unknown sentiment '%s'", sentimentTag.tag + ) + continue + return ret + + @dataclass class NCMECEntryUpdate: + # The entry id id: str + # The esp_id for the uploader member_id: int + # The entry or content type (e.g. image/video) entry_type: NCMECEntryType + # Whether or not this is a tombstone for a deleted entry deleted: bool + # The string classification of the entry classification: t.Optional[str] + # The hashes/fingerprints for this entry, Dict[type, value]. 
This is + # roughly equivalent to SignalType.get_name(): signal_value, but NCMEC + # chooses different names for these fingerprints: t.Dict[str, str] + # The feedback (upvote/downvote) that other ESPs have given for this entry + # Keyed the same way as fingerprints + feedback: t.Dict[str, t.List[Feedback]] @classmethod def from_xml(cls, xml: _XMLWrapper) -> "NCMECEntryUpdate": @@ -148,6 +254,7 @@ def from_xml(cls, xml: _XMLWrapper) -> "NCMECEntryUpdate": fingerprints={ x.tag: x.text for x in xml.maybe("fingerprints") if x.has_text }, + feedback=Feedback.get_from_entry_feedback(xml), ) @@ -215,11 +322,31 @@ def estimated_entries_in_range(self) -> int: ) +# TODO: once we know the shape of response, finish this class +@dataclass +class UpdateEntryResponse: + updates: t.List[NCMECEntryUpdate] + + @classmethod + def from_xml( + cls, xml: _XMLWrapper, fallback_max_time: int + ) -> "UpdateEntryResponse": + updates: t.List[NCMECEntryUpdate] = [] + + for content_xml in (xml.maybe("images"), xml.maybe("videos")): + if not content_xml or not len(content_xml): + continue + updates.extend(NCMECEntryUpdate.from_xml(c) for c in content_xml) + + return cls(updates) + + @unique class NCMECEndpoint(Enum): status = "status" entries = "entries" members = "members" + feedback = "feedback" class NCMECEnvironment(Enum): @@ -266,10 +393,13 @@ def __init__( self.username = username self.password = password self._base_url = environment.value + self._my_esp: t.Optional[StatusResult] = None + # type -> guid -> name + self._feedback_reason_map: t.Dict[FingerprintType, t.Dict[str, str]] = {} def _get_session(self) -> requests.Session: """ - Custom requests sesson + Custom requests session Ideally, should be used within a context manager: ``` @@ -295,7 +425,9 @@ def _get_session(self) -> requests.Session: ) return session - def _get(self, endpoint: NCMECEndpoint, *, next_: str = "", **params) -> ET.Element: + def _get( + self, endpoint: NCMECEndpoint, *, path: str = "", next_: str = "", **params 
+ ) -> ET.Element: """ Perform an HTTP GET request, and return the XML response payload. @@ -303,6 +435,8 @@ def _get(self, endpoint: NCMECEndpoint, *, next_: str = "", **params) -> ET.Elem """ url = "/".join((self._base_url, self.VERSION, endpoint.value)) + if path: + url = "/".join((url, path)) if next_: url = self._base_url + next_ params = {} @@ -328,25 +462,71 @@ def _post(self, endpoint: NCMECEndpoint, *, data=None) -> t.Any: No timeout or retry strategy. """ - url = "/".join((self._base_url, endpoint.value)) + url = "/".join((self._base_url, self.VERSION, endpoint.value)) with self._get_session() as session: response = session.post(url, data=data) response.raise_for_status() return response + def _put( + self, + endpoint: NCMECEndpoint, + *, + member_id: t.Optional[int] = None, + entry_id: t.Optional[str] = None, + feedback_type: t.Optional[FingerprintType] = None, + data=None, + ) -> t.Any: + """ + Perform an HTTP PUT request, and return the raw response object. + + No timeout or retry strategy. 
+ """ + + url = "/".join((self._base_url, self.VERSION, endpoint.value)) + if feedback_type and member_id and entry_id: + url = "/".join( + ( + # Extend the versioned endpoint URL built above + url, + str(member_id), + entry_id, + feedback_type.value, + NCMECEndpoint.feedback.value, + ) + ) + with self._get_session() as session: + response = session.put(url, data=data) + response.raise_for_status() + return response + def status(self) -> StatusResult: """Query the status endpoint, which tells you who you are.""" response = self._get(NCMECEndpoint.status) - member = _XMLWrapper(response)["member"] - return StatusResult(member.int("id"), member.text) + ret = StatusResult.from_xml(_XMLWrapper(response)["member"]) + self._my_esp = deepcopy(ret) + return ret def members(self) -> t.List[StatusResult]: """Query the members endpoint, which gives you a list of esps""" response = self._get(NCMECEndpoint.members) - return [ - StatusResult(member.int("id"), member.text) - for member in _XMLWrapper(response) - ] + return [StatusResult.from_xml(member) for member in _XMLWrapper(response)] + + def feedback_reasons(self, fingerprint_type: FingerprintType) -> t.Dict[str, str]: + """ + Get the possible negative feedback reasons for this type + + Returns a mapping of reason GUID -> reason name, also cached on the client + """ + xml = _XMLWrapper( + self._get(NCMECEndpoint.feedback, path=f"{fingerprint_type.value}/reasons") + ) + ret = { + reason.str("guid"): reason.str("name") + for reason in xml["availableFeedbackReasons"] + } + self._feedback_reason_map[fingerprint_type] = deepcopy(ret) + return ret def get_entries( self, @@ -401,6 +581,54 @@ def get_entries_iter( has_more = bool(next_) yield result + def submit_feedback( + self, + entry_id: str, + fingerprint_type: FingerprintType, + affirmative: bool, + negative_reason_guid: t.Optional[str] = None, + ) -> None: + + # need member_id to submit feedback + my_esp = self._my_esp + if my_esp is None: + my_esp = self.status() + + # 
Prepare the XML payload + root = 
ET.Element("feedbackSubmission") + root.set("xmlns", "https://hashsharing.ncmec.org/hashsharing/v2") + vote = ET.SubElement(root, "affirmative" if affirmative else "negative") + + if not affirmative: + if not negative_reason_guid: + # We need a reason ID, but there may be only one choice + # so we can just use that one + if fingerprint_type not in self._feedback_reason_map: + self.feedback_reasons(fingerprint_type) + feedback_options = self._feedback_reason_map[fingerprint_type] + if not feedback_options: + raise Exception( + "No feedback options for this type? Try reaching out to NCMEC" + ) + if len(feedback_options) == 1: + # Only one choice + negative_reason_guid = next(iter(feedback_options.keys())) + if not negative_reason_guid: + raise Exception( + f"Need to pick a feedback reason. Options: {feedback_options}" + ) + reasons = ET.SubElement(vote, "reasonIds") + guid = ET.SubElement(reasons, "guid") + guid.text = negative_reason_guid + + self._put( + NCMECEndpoint.entries, + member_id=my_esp.esp_id, + entry_id=entry_id, + feedback_type=fingerprint_type, + data=ET.tostring(root), + ) + def _date_format(timestamp: int) -> str: """ISO 8601 format yyyy-MM-dd'T'HH:mm:ss.SSSZ""" diff --git a/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/data.py b/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/data.py new file mode 100644 index 000000000..21b7a9ca5 --- /dev/null +++ b/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/data.py @@ -0,0 +1,220 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+ +STATUS_XML = """ + + + 127.0.0.1 + testington + Sir Testington + +""".strip() + +NEXT_UNESCAPED = ( + "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z" + "&to=2017-10-30T00%3A00%3A00.000Z&start=2001&size=1000&max=3000" +) + +NEXT_UNESCAPED2 = ( + "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z" + "&to=2017-10-30T00%3A00%3A00.000Z&start=3001&size=1000&max=4000" +) +NEXT_UNESCAPED3 = ( + "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z" + "&to=2017-10-30T00%3A00%3A00.000Z&start=4001&size=1000&max=5000" +) + +ENTRIES_XML = """ + + + + + Example Member + 2017-10-24T15:00:00Z + image1 + A1 + + a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1 + a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1 + a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1... + + + + + Example Member + + + + + + + Example Member + + + + + + + Example Member2 + image4 + 2017-10-24T15:10:00Z + + + + + + Example Member + video4 + 2017-10-24T15:20:00Z + + + + /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=2001&size=1000&max=3000 + + +""".strip() + + +ENTRIES_XML2 = """ + + + + + Example Member + 2019-10-24T15:00:00Z + image10 + A1 + + b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1 + b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1 + b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1... 
+ + + + + + /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=3001&size=1000&max=4000 + + +""".strip() + +# This example isn't in the documentation, but shows how updates work +ENTRIES_XML3 = """ + + + + + + + + /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=4001&size=1000&max=5000 + + +""".strip() + +ENTRIES_XML4 = """ + + + + + + TX Example + 2019-11-25T15:10:00Z + willdelete + + + +""".strip() + +ENTRIES_LARGE_FINGERPRINTS = """ + + + + + + +""".strip() + +STATUS_XML = """ + + + 1.1.1.1 + test_user + test member + +""".strip() + +FEEDBACK_REASONS_XML = """ + + + + +""".strip() + +AFFIRMATIVE_FEEDBACK_XML = """ + + + + + + +""".strip() + +NEGATIVE_FEEDBACK_XML = """ + + + + + 01234567-abcd-0123-4567-012345678900 + + + +""".strip() + +UPDATE_FEEDBACK_RESULT_XML = """ + + + + +""".strip() diff --git a/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/test_hash_api.py b/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/test_hash_api.py index 89968543a..a12330091 100644 --- a/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/test_hash_api.py +++ b/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/test_hash_api.py @@ -1,186 +1,41 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. 
from unittest.mock import Mock -import urllib.parse import typing as t import pytest import requests from threatexchange.exchanges.clients.ncmec.hash_api import ( NCMECEntryType, NCMECEntryUpdate, + FingerprintType, NCMECHashAPI, NCMECEnvironment, + StatusResult, ) - -STATUS_XML = """ - - - 127.0.0.1 - testington - Sir Testington - -""".strip() - -NEXT_UNESCAPED = ( - "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z" - "&to=2017-10-30T00%3A00%3A00.000Z&start=2001&size=1000&max=3000" -) - -NEXT_UNESCAPED2 = ( - "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z" - "&to=2017-10-30T00%3A00%3A00.000Z&start=3001&size=1000&max=4000" +from threatexchange.exchanges.clients.ncmec.tests.data import ( + ENTRIES_LARGE_FINGERPRINTS, + ENTRIES_XML, + ENTRIES_XML2, + ENTRIES_XML3, + ENTRIES_XML4, + NEXT_UNESCAPED, + NEXT_UNESCAPED2, + NEXT_UNESCAPED3, + STATUS_XML, + UPDATE_FEEDBACK_RESULT_XML, ) -NEXT_UNESCAPED3 = ( - "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z" - "&to=2017-10-30T00%3A00%3A00.000Z&start=4001&size=1000&max=5000" -) - -ENTRIES_XML = """ - - - - - Example Member - 2017-10-24T15:00:00Z - image1 - A1 - - a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1 - a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1 - a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1... - - - - Example Member2 - image4 - 2017-10-24T15:10:00Z - - - - - - Example Member - video4 - 2017-10-24T15:20:00Z - - - - /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=2001&size=1000&max=3000 - - -""".strip() - - -ENTRIES_XML2 = """ - - - - - Example Member - 2019-10-24T15:00:00Z - image10 - A1 - - b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1 - b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1 - b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1... 
- - - - - /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=3001&size=1000&max=4000 - - -""".strip() - -# This example isn't in the documentation, but shows how updates work -ENTRIES_XML3 = """ - - - - - - - - /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=4001&size=1000&max=5000 - - -""".strip() - -ENTRIES_XML4 = """ - - - - - - TX Example - 2019-11-25T15:10:00Z - willdelete - - - -""".strip() - -ENTRIES_LARGE_FINGERPRINTS = """ - - - - - - -""".strip() def mock_get_impl(url: str, **params): content = ENTRIES_XML if url.endswith(NEXT_UNESCAPED): content = ENTRIES_XML2 - if url.endswith(NEXT_UNESCAPED2): + elif url.endswith(NEXT_UNESCAPED2): content = ENTRIES_XML3 - if url.endswith(NEXT_UNESCAPED3): + elif url.endswith(NEXT_UNESCAPED3): content = ENTRIES_XML4 + elif url.endswith("/status"): + content = STATUS_XML # Void your warantee by messing with requests state resp = requests.Response() resp._content = content.encode() @@ -216,6 +71,7 @@ def api(monkeypatch: pytest.MonkeyPatch): session = Mock( strict_spec=["get", "__enter__", "__exit__"], get=mock_get_impl, + _put=Mock(), __enter__=lambda _: session, __exit__=lambda *args: None, ) @@ -274,6 +130,14 @@ def assert_fifth_entry(entry: NCMECEntryUpdate) -> None: } +def test_mocked_status(api: NCMECHashAPI): + assert api._my_esp is None + result = api.status() + assert result.esp_id == 1 + assert result.esp_name == "test member" + assert result == api._my_esp + + def test_mocked_get_hashes(api: NCMECHashAPI): result = api.get_entries() @@ -323,3 +187,15 @@ def test_large_fingerprint_entries(monkeypatch): assert len(update.fingerprints) == 1 assert update.fingerprints == {"md5": "facefacefacefacefacefacefaceface"} assert result.next == "" + + +def test_feedback_entries(api: NCMECHashAPI): + # We'll mock that we've already read our own ESP + + api.submit_feedback("image1", FingerprintType.md5, True) + api.submit_feedback( + "image1", + 
FingerprintType.md5, + False, + "01234567-abcd-0123-4567-012345678900", + ) diff --git a/python-threatexchange/threatexchange/signal_type/signal_base.py b/python-threatexchange/threatexchange/signal_type/signal_base.py index 3e889efa5..393cf6339 100644 --- a/python-threatexchange/threatexchange/signal_type/signal_base.py +++ b/python-threatexchange/threatexchange/signal_type/signal_base.py @@ -6,7 +6,6 @@ import abc import pathlib -import random import typing as t from threatexchange import common