From 2ba45a03049e8a47a5774c0c8acd4349e0ef22c1 Mon Sep 17 00:00:00 2001 From: ddavis-2015 Date: Tue, 16 Jul 2024 14:01:48 -0700 Subject: [PATCH] Draft PR: testing --- tensorflow/lite/micro/compression.h | 70 ++ tensorflow/lite/micro/compression/BUILD | 94 ++ tensorflow/lite/micro/compression/compress.py | 244 +++++ .../lite/micro/compression/metadata.fbs | 38 + .../micro/compression/metadata_generated.h | 148 +++ .../lite/micro/compression/metadata_test.cc | 71 ++ .../lite/micro/compression/metadata_test.py | 67 ++ .../lite/micro/compression/original.fbs | 82 ++ .../lite/micro/compression/original_test.py | 76 ++ tensorflow/lite/micro/compression/view.py | 155 ++++ .../micro_speech/micro_speech_test.cc | 7 - tensorflow/lite/micro/fake_micro_context.cc | 82 +- tensorflow/lite/micro/fake_micro_context.h | 36 +- tensorflow/lite/micro/kernels/conv.cc | 51 +- tensorflow/lite/micro/kernels/conv.h | 10 +- tensorflow/lite/micro/kernels/conv_common.cc | 19 +- tensorflow/lite/micro/kernels/conv_test.cc | 349 +++++++- tensorflow/lite/micro/kernels/conv_test.h | 276 +++++- .../lite/micro/kernels/conv_test_common.cc | 104 +-- .../lite/micro/kernels/fully_connected.cc | 62 +- .../lite/micro/kernels/fully_connected.h | 10 +- .../micro/kernels/fully_connected_test.cc | 336 ++++++- .../lite/micro/kernels/kernel_runner.cc | 16 +- tensorflow/lite/micro/kernels/kernel_runner.h | 9 +- tensorflow/lite/micro/kernels/kernel_util.h | 27 +- .../lite/micro/kernels/transpose_conv.cc | 75 +- .../lite/micro/kernels/transpose_conv_test.cc | 840 +++++++++++++++--- tensorflow/lite/micro/micro_allocator.cc | 250 +++++- tensorflow/lite/micro/micro_allocator.h | 20 +- tensorflow/lite/micro/micro_context.cc | 135 ++- tensorflow/lite/micro/micro_context.h | 32 +- .../lite/micro/micro_interpreter_context.cc | 102 ++- .../lite/micro/micro_interpreter_context.h | 27 +- .../lite/micro/micro_interpreter_test.cc | 55 +- .../lite/micro/recording_micro_allocator.cc | 30 +- .../lite/micro/recording_micro_allocator.h | 17 +- .../micro/recording_micro_allocator_test.cc | 66 +- .../lite/micro/test_helper_custom_ops.cc | 189 +++- .../lite/micro/test_helper_custom_ops.h | 19 +- tensorflow/lite/micro/test_helpers.cc | 173 +++- tensorflow/lite/micro/test_helpers.h | 110 ++- tensorflow/lite/micro/testing/micro_test.h | 9 +- .../lite/micro/tools/benchmarking/metrics.cc | 30 +- .../micro/tools/ci_build/test_x86_default.sh | 6 + tensorflow/lite/micro/tools/make/Makefile | 14 + 45 files changed, 4261 insertions(+), 377 deletions(-) create mode 100644 tensorflow/lite/micro/compression.h create mode 100644 tensorflow/lite/micro/compression/BUILD create mode 100644 tensorflow/lite/micro/compression/compress.py create mode 100644 tensorflow/lite/micro/compression/metadata.fbs create mode 100644 tensorflow/lite/micro/compression/metadata_generated.h create mode 100644 tensorflow/lite/micro/compression/metadata_test.cc create mode 100644 tensorflow/lite/micro/compression/metadata_test.py create mode 100644 tensorflow/lite/micro/compression/original.fbs create mode 100644 tensorflow/lite/micro/compression/original_test.py create mode 100644 tensorflow/lite/micro/compression/view.py diff --git a/tensorflow/lite/micro/compression.h b/tensorflow/lite/micro/compression.h new file mode 100644 index 00000000000..d6a2b27b091 --- /dev/null +++ b/tensorflow/lite/micro/compression.h @@ -0,0 +1,70 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_MICRO_COMPRESSION_H_ +#define TENSORFLOW_LITE_MICRO_MICRO_COMPRESSION_H_ + +#ifdef USE_TFLM_COMPRESSION + +#include "tensorflow/lite/c/common.h" + +namespace tflite { + +// +// Compressed tensors +// + +static constexpr const char* kCompressionMetadataString = "TFLM_COMPRESSION"; + +enum class CompressionScheme : uint8_t { + kBinQuant, +}; + +// TODO(ddavis-2015): pack struct +struct BinQuantData { + static constexpr size_t kMaxBitWidth = 7; + static constexpr size_t kMaxValueTableChannelStride = 128; + + const void* value_table; // Pointer into FlatBuffer Values. + uint8_t value_table_channel_stride; // elements per channel + uint8_t compressed_bit_width : 3; // 1 to 7 bits + bool is_per_channel_quantized : 1; // tensor is per-channel quantized + bool use_alternate_axis : 1; // shape default channel: + // 0 = first, 1 = last + uint8_t reserved : 3; +}; + +union CompressionData { + BinQuantData bin_quant; +}; + +// TODO(ddavis-2015): pack struct +struct CompressionTensorData { + CompressionScheme scheme; + CompressionData data; +}; + +// TODO(ddavis-2015): pack struct +struct CompressedTensorList { + // Sparsely populated array with the same number of elements as there are + // tensors in the Subgraph. An alternative would include a tensor index in + // the struct for each and walk the list on look up. This could be slow. 
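+  //
+  // Illustrative lookup sketch (not part of this change; assumes `list` points
+  // to a populated CompressedTensorList and `i` is a subgraph tensor index):
+  //
+  //   const CompressionTensorData* ctd = list->tensors[i];
+  //   if (ctd != nullptr && ctd->scheme == CompressionScheme::kBinQuant) {
+  //     const void* values = ctd->data.bin_quant.value_table;
+  //     // ... decompress using `values` and compressed_bit_width ...
+  //   }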
+ CompressionTensorData** tensors; +}; + +} // namespace tflite + +#endif // USE_TFLM_COMPRESSION +#endif // TENSORFLOW_LITE_MICRO_MICRO_COMPRESSION_H_ diff --git a/tensorflow/lite/micro/compression/BUILD b/tensorflow/lite/micro/compression/BUILD new file mode 100644 index 00000000000..cde1b55bb15 --- /dev/null +++ b/tensorflow/lite/micro/compression/BUILD @@ -0,0 +1,94 @@ +load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library", "flatbuffer_py_library") +load("@rules_python//python:defs.bzl", "py_test") +load("@tflm_pip_deps//:requirements.bzl", "requirement") + +package( + default_visibility = [ + "//visibility:public", + ], +) + +flatbuffer_cc_library( + name = "metadata_flatbuffer_cc", + srcs = ["metadata.fbs"], +) + +flatbuffer_py_library( + name = "original_flatbuffer_py", + srcs = ["original.fbs"], +) + +flatbuffer_py_library( + name = "metadata_flatbuffer_py", + srcs = ["metadata.fbs"], +) + +cc_test( + name = "metadata_test_cc", + srcs = ["metadata_test.cc"], + deps = [ + "metadata_flatbuffer_cc", + "//tensorflow/lite/micro:hexdump", + "@flatbuffers//:runtime_cc", + ], + size = "small", +) + +py_binary( + name = "compress", + srcs = ["compress.py"], + deps = [ + "@absl_py//absl:app", + "@absl_py//absl/flags", + "@absl_py//absl/logging", + "@flatbuffers//:runtime_py", + "metadata_flatbuffer_py", + "//tensorflow/lite/python:schema_py", + requirement("bitarray"), + requirement("numpy"), + requirement("scikit-learn"), + ], +) + +py_binary( + name = "view", + srcs = [ + "view.py", + ], + deps = [ + "metadata_flatbuffer_py", + "//tensorflow/lite/python:schema_py", + ], +) + +py_test( + name = "metadata_test_py", + main = "metadata_test.py", + srcs = ["metadata_test.py"], + deps = [ + "metadata_flatbuffer_py", + "@flatbuffers//:runtime_py", + requirement("hexdump"), + ], + size = "small", +) + +py_test( + name = "original_test_py", + main = "original_test.py", + srcs = ["original_test.py"], + deps = [ + "original_flatbuffer_py", + "@flatbuffers//:runtime_py", + requirement("hexdump"), + ], + size = "small", +) + +genrule( + name = "hello_world_int8.compressed", + srcs = ["//tensorflow/lite/micro/examples/hello_world/models:hello_world_int8.tflite"], + outs = ["hello_world_int8.compressed.tflite"], + cmd = "$(location :compress) --input_model_path $< --output_model_path $@", + tools = [":compress"], +) diff --git a/tensorflow/lite/micro/compression/compress.py b/tensorflow/lite/micro/compression/compress.py new file mode 100644 index 00000000000..18834982f24 --- /dev/null +++ b/tensorflow/lite/micro/compression/compress.py @@ -0,0 +1,244 @@ +# Copyright 2024 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Reduces the number of weights in a .tflite model using various strategies.""" + +# Usage information: +# Default: +# `bazel run tensorflow/lite/micro/tools:compress -- \ +# --input_model_path=` \ +# --output_model_path=` + + +from tensorflow.lite.micro.compression import metadata_flatbuffer_py_generated as compression_schema +from tensorflow.lite.python import schema_py_generated as tflite_schema + +from absl import app +from absl import flags +from absl import logging +import bitarray +import bitarray.util +import numpy as np +import flatbuffers +import sklearn.cluster +import struct + + +_INPUT_MODEL_PATH = flags.DEFINE_string( + "input_model_path", + None, + ".tflite input model path", + required=True, +) + +_TEST_COMPRESSED_MODEL = flags.DEFINE_bool( + "test_compressed_model", + False, + "optional config to test models with random data and" + " report on the differences in output.", +) + +_OUTPUT_MODEL_PATH = flags.DEFINE_string( + "output_model_path", + None, + ".tflite output path. Leave blank if same as input+.compressed.tflite", +) + + +def read_model(path): + with open(path, 'rb') as file: + buffer = bytearray(file.read()) + return tflite_schema.ModelT.InitFromPackedBuf(buffer, 0) + + +def write_model(model, path): + builder = flatbuffers.Builder(32) + root = model.Pack(builder) + builder.Finish(root) + buffer: bytearray = builder.Output() + + with open(path, 'wb') as file: + file.write(buffer) + + +def pack_compression_metadata(m): + builder = flatbuffers.Builder(32) + root = m.Pack(builder) + builder.Finish(root) + buffer: bytearray = builder.Output() + return buffer + + +def pack_lut_indexes(indexes, bitwidth): + """Pack the sequence of integers given in `indexes` into bitwidth-wide fields + in a buffer, and return the buffer. Raise an OverflowError if any element + does not fit into a bitwidth-wide field. """ + ba = bitarray.bitarray(endian="big") + for i in indexes: + field = bitarray.util.int2ba(i, length=bitwidth, endian="big") + ba.extend(field) + return ba.tobytes() + + +def pack_lut_values(values, struct_format): + """Pack the `values` into a buffer of bytes, using a `struct_format` + character from the standard module `struct` to determine the type of values + and corresponding encoding into bytes. Always little-endian byte order. + """ + buffer = bytearray() + little_endian = "<" + packer = struct.Struct(little_endian + struct_format) + for v in values: + buffer.extend(packer.pack(v)) + return buffer + + +def unpack_buffer_values(data, struct_format): + little_endian = "<" + unpacker = struct.Struct(little_endian + struct_format) + values = [v[0] for v in unpacker.iter_unpack(bytes(data))] + return values + + +def tensor_type_to_struct_format(type): + m = { + tflite_schema.TensorType.INT8: "b", + tflite_schema.TensorType.INT16: "h", + tflite_schema.TensorType.FLOAT32: "f", + } + return m[type] + + +def bq(sequence, num_values): + """Quantize a sequence of integers, minimizing the total error using k-means + clustering. 
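+
+  For example (illustrative; the exact cluster centers and label order depend
+  on the k-means run): bq([10, 12, 30, 31], 2) groups the values into two
+  clusters and returns roughly ([0, 0, 1, 1], [11, 30]).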
+
+  Parameters:
+    sequence :list - a sequence of integers to be quantized
+    num_values :int - the number of quantization levels
+
+  Returns:
+    (indexes, values): a tuple with the list of indexes and list of values
+  """
+  sequence = np.array(sequence).reshape(-1, 1)
+  kmeans = sklearn.cluster.KMeans(n_clusters=num_values,
+                                  random_state=0).fit(sequence)
+  values = kmeans.cluster_centers_.flatten()
+  values = np.round(values).astype(int).tolist()
+  indexes = kmeans.predict(sequence).tolist()
+  return (indexes, values)
+
+
+def compress_tensor(subgraph_id, tensor_id, model):
+  subgraph = model.subgraphs[subgraph_id]
+  tensor = subgraph.tensors[tensor_id]
+  struct_format = tensor_type_to_struct_format(tensor.type)
+  buffer_id = tensor.buffer
+  buffer = model.buffers[buffer_id]
+  sequence = unpack_buffer_values(buffer.data, struct_format)
+  bitwidth = 2
+  indexes, values = bq(sequence, 2 ** bitwidth)
+
+  # append index buffer
+  buffer = tflite_schema.BufferT()
+  buffer.data = pack_lut_indexes(indexes, bitwidth)
+  model.buffers.append(buffer)
+  index_id = len(model.buffers) - 1
+
+  # append value buffer
+  buffer = tflite_schema.BufferT()
+  buffer.data = pack_lut_values(values, struct_format)
+  model.buffers.append(buffer)
+  value_id = len(model.buffers) - 1
+
+  # create metadata
+  lut_tensor = compression_schema.LutTensorT()
+  lut_tensor.subgraph = subgraph_id
+  lut_tensor.tensor = tensor_id
+  lut_tensor.indexBitwidth = bitwidth
+  lut_tensor.indexBuffer = index_id
+  lut_tensor.valueBuffer = value_id
+
+  return lut_tensor
+
+
+def compress_fully_connected(subgraph_id, operator_id, model):
+  # On a fully_connected operator, we compress the 2nd input (the weights
+  # tensor). The 3rd input (the bias tensor) is left uncompressed for now.
+  subgraph = model.subgraphs[subgraph_id]
+  operator = subgraph.operators[operator_id]
+  tensor_id_2 = operator.inputs[1]
+  # tensor_id_3 = operator.inputs[2]
+  lut_tensor_2 = compress_tensor(subgraph_id, tensor_id_2, model)
+  # lut_tensor_3 = compress_tensor(subgraph_id, tensor_id_2, model)
+  return (lut_tensor_2,)
+
+
+def get_opcode_compressions(model):
+  """Return a map of operator_code indexes to compression functions, for those
+  operators we wish to and know how to compress.
+ """ + compressable = {tflite_schema.BuiltinOperator.FULLY_CONNECTED: compress_fully_connected} + compressions = {} + for index, code in enumerate(model.operatorCodes): + if code.builtinCode in compressable: + compressions[index] = compressable[code.builtinCode] + return compressions + + +def compress(model): + # Walk op codes, identify those we compress, note index + # Walk operators, match op code indexes, note tensors to compress + # Walk those tensors, creating LUTs in buffers and metadata + + compressions = get_opcode_compressions(model) + + lut_tensors = [] + + for subgraph_id, subgraph in enumerate(model.subgraphs): + for operator_id, operator in enumerate(subgraph.operators): + fn = compressions.get(operator.opcodeIndex) + if fn is not None: + result = fn(subgraph_id, operator_id, model) + if result is not None: + lut_tensors.extend(result) + + compression_metadata = compression_schema.MetadataT() + compression_metadata.lutTensors = lut_tensors + + return compression_metadata + + +def main(_) -> None: + output_model_path = _OUTPUT_MODEL_PATH.value or ( + _INPUT_MODEL_PATH.value.split(".tflite")[0] + ".compressed.tflite") + logging.info("compressing %s to %s", _INPUT_MODEL_PATH.value, output_model_path) + + model = read_model(_INPUT_MODEL_PATH.value) + + compression_metadata = compress(model) + + buffer = tflite_schema.BufferT() + buffer.data = pack_compression_metadata(compression_metadata) + model.buffers.append(buffer) + + metadata = tflite_schema.MetadataT() + metadata.name = "COMPRESSION_METADATA" + metadata.buffer = len(model.buffers) - 1 + model.metadata.append(metadata) + + write_model(model, output_model_path) + + +if __name__ == "__main__": + app.run(main) diff --git a/tensorflow/lite/micro/compression/metadata.fbs b/tensorflow/lite/micro/compression/metadata.fbs new file mode 100644 index 00000000000..dcfb1ccafb9 --- /dev/null +++ b/tensorflow/lite/micro/compression/metadata.fbs @@ -0,0 +1,38 @@ +// Copyright 2024 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Flatbuffer schema describing a TFLM compressed model. Use as the value for +// the key "TFLM_COMPRESSION" in the metadata table in a .tflite flatbuffer. + +namespace tflite.micro.compression; + +table Metadata { + lut_tensors:[LutTensor]; // list of tensors that are compressed by LUT +} + +struct LutTensor { + subgraph:uint16; // the index of the subgraph + tensor:uint16; // the index of the tensor in its subgraph + index_bitwidth:uint8; // the bit-width of LUT indexes + index_buffer:uint16; // the index of the buffer containing LUT indexes + value_buffer:uint16; // the index of the buffer containing LUT values +} +// Look-Up-Table tensors are encoded in two buffers: an index buffer and a +// value buffer. The indexes are unsigned integers packed into the index buffer +// in bitwidth-wide bit fields with a big-endian bit order. The data in the +// value buffer is encoded as usual according to the type of the tensor. 
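+//
+// Worked example (illustrative, not normative): with index_bitwidth = 2, the
+// index sequence [2, 1, 0, 3] packs into one byte as the bit fields
+// 10 01 00 11, i.e. 0x93; element i then decodes to the value at position
+// index(i) in the value buffer.
+//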
+// Tensors with multiple channels have distinct values tables for each channel, +// concatinated into one value buffer. (Will elaborate this comment.) + +root_type Metadata; diff --git a/tensorflow/lite/micro/compression/metadata_generated.h b/tensorflow/lite/micro/compression/metadata_generated.h new file mode 100644 index 00000000000..eaa03cb21e8 --- /dev/null +++ b/tensorflow/lite/micro/compression/metadata_generated.h @@ -0,0 +1,148 @@ +// automatically generated by the FlatBuffers compiler, do not modify + + +#ifndef FLATBUFFERS_GENERATED_METADATA_TFLITE_MICRO_COMPRESSION_H_ +#define FLATBUFFERS_GENERATED_METADATA_TFLITE_MICRO_COMPRESSION_H_ + +#include "flatbuffers/flatbuffers.h" + +namespace tflite { +namespace micro { +namespace compression { + +struct Metadata; +struct MetadataBuilder; + +struct LutTensor; + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(2) LutTensor FLATBUFFERS_FINAL_CLASS { + private: + uint16_t subgraph_; + uint16_t tensor_; + uint8_t index_bitwidth_; + int8_t padding0__; + uint16_t index_buffer_; + uint16_t value_buffer_; + + public: + LutTensor() + : subgraph_(0), + tensor_(0), + index_bitwidth_(0), + padding0__(0), + index_buffer_(0), + value_buffer_(0) { + (void)padding0__; + } + LutTensor(uint16_t _subgraph, uint16_t _tensor, uint8_t _index_bitwidth, uint16_t _index_buffer, uint16_t _value_buffer) + : subgraph_(flatbuffers::EndianScalar(_subgraph)), + tensor_(flatbuffers::EndianScalar(_tensor)), + index_bitwidth_(flatbuffers::EndianScalar(_index_bitwidth)), + padding0__(0), + index_buffer_(flatbuffers::EndianScalar(_index_buffer)), + value_buffer_(flatbuffers::EndianScalar(_value_buffer)) { + } + uint16_t subgraph() const { + return flatbuffers::EndianScalar(subgraph_); + } + uint16_t tensor() const { + return flatbuffers::EndianScalar(tensor_); + } + uint8_t index_bitwidth() const { + return flatbuffers::EndianScalar(index_bitwidth_); + } + uint16_t index_buffer() const { + return flatbuffers::EndianScalar(index_buffer_); + } + uint16_t value_buffer() const { + return flatbuffers::EndianScalar(value_buffer_); + } +}; +FLATBUFFERS_STRUCT_END(LutTensor, 10); + +struct Metadata FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef MetadataBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_LUT_TENSORS = 4 + }; + const flatbuffers::Vector *lut_tensors() const { + return GetPointer *>(VT_LUT_TENSORS); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_LUT_TENSORS) && + verifier.VerifyVector(lut_tensors()) && + verifier.EndTable(); + } +}; + +struct MetadataBuilder { + typedef Metadata Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_lut_tensors(flatbuffers::Offset> lut_tensors) { + fbb_.AddOffset(Metadata::VT_LUT_TENSORS, lut_tensors); + } + explicit MetadataBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateMetadata( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset> lut_tensors = 0) { + MetadataBuilder builder_(_fbb); + builder_.add_lut_tensors(lut_tensors); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateMetadataDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *lut_tensors = nullptr) { + auto lut_tensors__ = lut_tensors ? 
_fbb.CreateVectorOfStructs(*lut_tensors) : 0; + return tflite::micro::compression::CreateMetadata( + _fbb, + lut_tensors__); +} + +inline const tflite::micro::compression::Metadata *GetMetadata(const void *buf) { + return flatbuffers::GetRoot(buf); +} + +inline const tflite::micro::compression::Metadata *GetSizePrefixedMetadata(const void *buf) { + return flatbuffers::GetSizePrefixedRoot(buf); +} + +inline bool VerifyMetadataBuffer( + flatbuffers::Verifier &verifier) { + return verifier.VerifyBuffer(nullptr); +} + +inline bool VerifySizePrefixedMetadataBuffer( + flatbuffers::Verifier &verifier) { + return verifier.VerifySizePrefixedBuffer(nullptr); +} + +inline void FinishMetadataBuffer( + flatbuffers::FlatBufferBuilder &fbb, + flatbuffers::Offset root) { + fbb.Finish(root); +} + +inline void FinishSizePrefixedMetadataBuffer( + flatbuffers::FlatBufferBuilder &fbb, + flatbuffers::Offset root) { + fbb.FinishSizePrefixed(root); +} + +} // namespace compression +} // namespace micro +} // namespace tflite + +#endif // FLATBUFFERS_GENERATED_METADATA_TFLITE_MICRO_COMPRESSION_H_ diff --git a/tensorflow/lite/micro/compression/metadata_test.cc b/tensorflow/lite/micro/compression/metadata_test.cc new file mode 100644 index 00000000000..74b567c7d14 --- /dev/null +++ b/tensorflow/lite/micro/compression/metadata_test.cc @@ -0,0 +1,71 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// Test validity of the flatbuffer schema and illustrate use of the flatbuffer +// machinery with C++. 
+ +#include +#include + +#include "metadata_generated.h" +#include "tensorflow/lite/micro/hexdump.h" + +using tflite::micro::compression::LutTensor; +using tflite::micro::compression::Metadata; +using tflite::micro::compression::MetadataT; + +bool operator==(const LutTensor& a, const LutTensor& b) { + return + a.subgraph() == b.subgraph() && + a.tensor() == b.tensor() && + a.index_bitwidth() == b.index_bitwidth() && + a.index_buffer() == b.index_buffer() && + a.value_buffer() == b.value_buffer(); +} + +int main(int argc, char* argv[]) { + const LutTensor lut_tensor0 { + 0, // subgraph + 127, // tensor + 2, // index_bitwidth + 128, // index_buffer + 129, // value_buffer + }; + const LutTensor lut_tensor1 { + 1, // subgraph + 164, // tensor + 2, // index_bitwidth + 136, // index_buffer + 129, // value_buffer + }; + MetadataT metadata; + metadata.lut_tensors = {lut_tensor0, lut_tensor1}; + + flatbuffers::FlatBufferBuilder builder; + auto root = Metadata::Pack(builder, &metadata); + builder.Finish(root); + const uint8_t* buffer = builder.GetBufferPointer(); + + tflite::hexdump( + {reinterpret_cast(buffer), builder.GetSize()}); + std::cout << "length: " << builder.GetSize() << "\n"; + + auto readback = tflite::micro::compression::GetMetadata(buffer); + auto& read_lut_tensor0 = *readback->lut_tensors()->Get(0); + auto& read_lut_tensor1 = *readback->lut_tensors()->Get(1); + assert(read_lut_tensor0 == lut_tensor0); + assert(read_lut_tensor1 == lut_tensor1); + + return 0; +} diff --git a/tensorflow/lite/micro/compression/metadata_test.py b/tensorflow/lite/micro/compression/metadata_test.py new file mode 100644 index 00000000000..3d954154b8a --- /dev/null +++ b/tensorflow/lite/micro/compression/metadata_test.py @@ -0,0 +1,67 @@ +# Copyright 2024 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Test validity of the flatbuffer schema and illustrate use of the flatbuffer +# machinery with Python + +import sys +import hexdump +import flatbuffers + +# `.*_generated` is the name of the module created by the Bazel rule +# `flatbuffer_py_library' based on the schema. +from tensorflow.lite.micro.compression import metadata_flatbuffer_py_generated as schema + + +def main(): + # The classes with a `T` suffix provide an object-oriented representation of + # the object tree in the flatbuffer using native data structures. + lut_tensor0 = schema.LutTensorT() + lut_tensor0.subgraph = 1 + lut_tensor0.tensor = 127 + lut_tensor0.indexBitwidth = 2 + lut_tensor0.indexBuffer = 128 + lut_tensor0.valueBuffer = 129 + + lut_tensor1 = schema.LutTensorT() + lut_tensor1.subgraph = 1 + lut_tensor1.tensor = 164 + lut_tensor1.indexBitwidth = 2 + lut_tensor1.indexBuffer = 136 + lut_tensor1.valueBuffer = 129 + + metadata = schema.MetadataT() + metadata.lutTensors = [lut_tensor0, lut_tensor1] + + # Build the flatbuffer itself using the flatbuffers runtime module. 
+ builder = flatbuffers.Builder(32) + root = metadata.Pack(builder) + builder.Finish(root) + buffer: bytearray = builder.Output() + + print(hexdump.hexdump(buffer, result='return')) + print(f"length: {len(buffer)}") + + def attrs_equal(a, b): + return all(vars(a)[key] == vars(b)[key] for key in vars(a)) + + readback = schema.MetadataT.InitFromPackedBuf(buffer, 0) + assert attrs_equal(readback.lutTensors[0], lut_tensor0) + assert attrs_equal(readback.lutTensors[1], lut_tensor1) + + sys.exit() + + +if __name__ == "__main__": + main() diff --git a/tensorflow/lite/micro/compression/original.fbs b/tensorflow/lite/micro/compression/original.fbs new file mode 100644 index 00000000000..3a05a6cd4f2 --- /dev/null +++ b/tensorflow/lite/micro/compression/original.fbs @@ -0,0 +1,82 @@ +// Copyright 2024 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +namespace tflite.micro; + +table ValuesInt8 { + values:[int8]; +} + +table ValuesInt16 { + values:[int16]; +} + +table ValuesInt32 { + values:[int32]; +} + +table ValuesInt64 { + values:[int64]; +} + +table ValuesFloat32 { + values:[float32]; +} + +union ValuesUnion { + ValuesFloat32, + ValuesInt8, + ValuesInt16, + ValuesInt32, + ValuesInt64 +} + +table Values { + values:ValuesUnion; +} + +table BinQuantBufferOptions { + value_table_index:int; + compressed_bit_width:uint8; // Should be 2 or 4 +} + +union CompressedBufferOptions { + BinQuantBufferOptions, + // HuffmanBufferOptions, // Future +} + +table CompressedBuffer { + buffer_index:int; // Buffer index from the top-level Model buffer vector + options:CompressedBufferOptions; +} + +table BinQuantCompression { + version:uint8; + // For a given value table, if the corresponding buffer was per-tensor quantized, there should be 4 or 16 elements (2 bit or 4 bit indexes). + // If the buffer was per-channel quantized, there should be 4/16 x number of channels elements. These will be laid out in the table as: + // [c0v0, c0v1, c0v2, c0v3, c1v0, c1v1, ... cNv3] + value_tables:[Values]; +} + +table CompressionMetadata { + // List of compressed buffers + buffers:[CompressedBuffer]; + + // (Optional) Model-wide Bin & Quant compression parameters. Only needed if a + // CompressedBuffer contains BinQuantBufferOptions. + bin_quant_compression:BinQuantCompression; +} + +root_type CompressionMetadata; diff --git a/tensorflow/lite/micro/compression/original_test.py b/tensorflow/lite/micro/compression/original_test.py new file mode 100644 index 00000000000..edc8ad4d11f --- /dev/null +++ b/tensorflow/lite/micro/compression/original_test.py @@ -0,0 +1,76 @@ +# Copyright 2024 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Test validity of the flatbuffer schema and illustrate use of the flatbuffer
+# machinery with Python
+
+import sys
+import hexdump
+import flatbuffers
+
+# `.*_generated` is the name of the module created by the Bazel rule
+# `flatbuffer_py_library' based on the schema.
+from tensorflow.lite.micro.compression import original_flatbuffer_py_generated as schema
+
+
+def main():
+  # The classes with a `T` suffix provide an object-oriented representation of
+  # the object tree in the flatbuffer using native data structures.
+  bq0_options = schema.BinQuantBufferOptionsT()
+  bq0_options.valueTableIndex = 0
+  bq0_options.compressedBitWidth = 2
+
+  bq1_options = schema.BinQuantBufferOptionsT()
+  bq1_options.valueTableIndex = 1
+  bq1_options.compressedBitWidth = 4
+
+  buffer0 = schema.CompressedBufferT()
+  buffer0.bufferIndex = 0
+  buffer0.options = bq0_options
+  buffer0.optionsType = schema.CompressedBufferOptions.BinQuantBufferOptions
+
+  buffer1 = schema.CompressedBufferT()
+  buffer1.bufferIndex = 1
+  buffer1.options = bq1_options
+  buffer1.optionsType = schema.CompressedBufferOptions.BinQuantBufferOptions
+
+  valuesInt8 = schema.ValuesInt8T()
+  valuesInt8.values = [65]
+  values0 = schema.ValuesT()
+  values0.values = valuesInt8
+  values0.valuesType = schema.ValuesUnion.ValuesInt8
+
+  bq_compression = schema.BinQuantCompressionT()
+  bq_compression.valueTables = [values0]
+
+  metadata = schema.CompressionMetadataT()
+  metadata.buffers = [buffer0, buffer1]
+  metadata.binQuantCompression = bq_compression
+
+  # Build the flatbuffer itself using the flatbuffers runtime module.
+  builder = flatbuffers.Builder(32)
+  root = metadata.Pack(builder)
+  builder.Finish(root)
+  buffer: bytearray = builder.Output()
+
+  print(hexdump.hexdump(buffer, result='return'))
+  print(f"length: {len(buffer)}")
+
+  readback = schema.CompressionMetadataT.InitFromPackedBuf(buffer, 0)
+
+  sys.exit()
+
+
+if __name__ == "__main__":
+  main()
diff --git a/tensorflow/lite/micro/compression/view.py b/tensorflow/lite/micro/compression/view.py
new file mode 100644
index 00000000000..55c4255ede1
--- /dev/null
+++ b/tensorflow/lite/micro/compression/view.py
@@ -0,0 +1,155 @@
+# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ + +import pprint + +from tensorflow.lite.micro.compression import metadata_flatbuffer_py_generated as compression_schema +from tensorflow.lite.python import schema_py_generated as tflite_schema + + +def read_model(path): + with open(path, 'rb') as file: + buffer = bytearray(file.read()) + return tflite_schema.ModelT.InitFromPackedBuf(buffer, 0) + + +def unpack_list(source): + result = [] + for index, s in enumerate(source): + d = {"_index": index} | vars(s) + result.append(d) + return result + + +def unpack_operators(operators): + result = [] + for index, o in enumerate(operators): + d = {"_index": index, + "opcode_index": o.opcodeIndex, + "inputs": unpack_array(o.inputs), + "outputs": unpack_array(o.outputs), + } + result.append(d) + return result + + +def unpack_TensorType(type): + attrs = [attr for attr in dir(tflite_schema.TensorType) if not + attr.startswith("__")] + lut = {getattr(tflite_schema.TensorType, attr): attr for attr in attrs} + return lut[type] + + +def unpack_tensors(tensors): + result = [] + for index, t in enumerate(tensors): + d = {"_index": index, + "name": t.name.decode("utf-8"), + "type": unpack_TensorType(t.type), + "shape": unpack_array(t.shape), + "quantization": [unpack_array(t.quantization.scale), unpack_array(t.quantization.zeroPoint)], + "buffer": t.buffer, + } + result.append(d) + return result + + +def unpack_subgraphs(subgraphs): + result = [] + for index, s in enumerate(subgraphs): + d = {"_index": index, + "name": s.name, + # "inputs": s.inputs, + # "outputs": s.outputs, + "operators": unpack_operators(s.operators), + "tensors": unpack_tensors(s.tensors), + } + result.append(d) + return result + + +def unpack_metadata(metadata): + return [{"name": m.name.decode("utf-8"), "buffer": m.buffer} for m in + metadata] + + +def unpack_compression_metadata(buffer): + metadata = compression_schema.MetadataT.InitFromPackedBuf(buffer, 0) + result = [] + for index, t in enumerate(metadata.lutTensors): + d = {"_index": index, + "subgraph": t.subgraph, + "tensor": t.tensor, + "indexBitwidth": t.indexBitwidth, + "indexBuffer": t.indexBuffer, + "valueBuffer": t.valueBuffer, + } + result.append(d) + return {"lut_tensors": result} + + +def unpack_array(a): + try: + # Avoid printing as numpy arrays if possible. The pprint module does not + # format them well. 
+ a = a.tolist() + except AttributeError: + pass + return a + + +def unpack_buffers(buffers, compression_metadata=None): + result = [] + for index, b in enumerate(buffers): + d = {"_index": index} + d = d | {"data": unpack_array(b.data)} + if index == compression_metadata: d = d | {"_compression_metadata_decoded": + unpack_compression_metadata(bytes(b.data))} + result.append(d) + return result + + +def get_compression_metadata_buffer(model): + # Return the metadata buffer data or None + for item in model.metadata: + if item.name.decode("utf-8") == "COMPRESSION_METADATA": + return item.buffer + else: + return None + + +def print_model(model, format=None): + output = { + "description": model.description.decode("utf-8"), + "version": model.version, + "operator_codes": unpack_list(model.operatorCodes), + "metadata": unpack_metadata(model.metadata), + "subgraphs": unpack_subgraphs(model.subgraphs), + "buffers": unpack_buffers(model.buffers, + get_compression_metadata_buffer(model)), + } + + pprint.pprint(output, width=90, sort_dicts=False, compact=True) + + +def main(argv=None): + filename = argv[1] + model = read_model(filename) + print_model(model) + + +if __name__ == "__main__": + import sys + main(sys.argv) diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc index f31728c3707..6fe75c18c15 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc +++ b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc @@ -32,13 +32,6 @@ limitations under the License. #include "tensorflow/lite/micro/micro_mutable_op_resolver.h" #include "tensorflow/lite/micro/testing/micro_test.h" -#define TF_LITE_MICRO_CHECK_FAIL() \ - do { \ - if (micro_test::did_test_fail) { \ - return kTfLiteError; \ - } \ - } while (false) - namespace { // Arena size is a guesstimate, followed by use of diff --git a/tensorflow/lite/micro/fake_micro_context.cc b/tensorflow/lite/micro/fake_micro_context.cc index 5787ffd0648..1ee2c65f5e1 100644 --- a/tensorflow/lite/micro/fake_micro_context.cc +++ b/tensorflow/lite/micro/fake_micro_context.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,18 +15,34 @@ limitations under the License. 
#include "tensorflow/lite/micro/fake_micro_context.h" +#include + #include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/micro/arena_allocator/single_arena_buffer_allocator.h" #include "tensorflow/lite/micro/micro_arena_constants.h" #include "tensorflow/lite/micro/micro_log.h" +#include "tensorflow/lite/micro/micro_utils.h" namespace tflite { -FakeMicroContext::FakeMicroContext(TfLiteTensor* tensors, - SingleArenaBufferAllocator* allocator, - MicroGraph* micro_graph) - : graph_(*micro_graph), tensors_(tensors), allocator_(allocator) {} +FakeMicroContext::FakeMicroContext( + TfLiteTensor* tensors, SingleArenaBufferAllocator* allocator, + MicroGraph* micro_graph +#ifdef USE_TFLM_COMPRESSION + , + const CompressedTensorList* compressed_tensors +#endif // USE_TFLM_COMPRESSION + ) + : graph_(*micro_graph), + tensors_(tensors), + allocator_(allocator) +#ifdef USE_TFLM_COMPRESSION + , + compressed_tensors_(compressed_tensors) +#endif // USE_TFLM_COMPRESSION +{ +} TfLiteTensor* FakeMicroContext::AllocateTempTfLiteTensor(int tensor_index) { allocated_temp_count_++; @@ -112,4 +128,60 @@ void* FakeMicroContext::external_context() { return nullptr; } MicroGraph& FakeMicroContext::graph() { return graph_; } +#ifdef USE_TFLM_COMPRESSION + +// Available during Prepare & Eval. Returns false if tensor is not +// compressed. +bool FakeMicroContext::IsTensorCompressed(const TfLiteNode* node, + int tensor_idx) { + if (compressed_tensors_ != nullptr && tensor_idx < node->inputs->size) { + int index = node->inputs->data[tensor_idx]; + if (index >= 0 && compressed_tensors_->tensors[index] != nullptr) { + return true; + } + } + + return false; +} + +// Only available during Prepare. The kernel is responsible for storing the +// scratch buffer handle. +int FakeMicroContext::AllocateDecompressionScratchBuffer(const TfLiteNode* node, + int tensor_idx) { + if (compressed_tensors_ == nullptr || tensor_idx >= node->inputs->size) { + return -1; + } + int index = node->inputs->data[tensor_idx]; + if (index < 0 || compressed_tensors_->tensors[index] == nullptr) { + return -1; + } + TfLiteTensor* tensor = &tensors_[index]; + int scratch_index = -1; + TfLiteStatus result = + RequestScratchBufferInArena(tensor->bytes, &scratch_index); + if (result != kTfLiteOk) { + return -1; + } + + return scratch_index; +} + +// Available during Prepare & Eval. Returns nullptr if tensor is not +// compressed. +const CompressionTensorData* FakeMicroContext::GetTensorCompressionData( + const TfLiteNode* node, int tensor_idx) { + if (compressed_tensors_ == nullptr || tensor_idx >= node->inputs->size) { + return nullptr; + } + + int index = node->inputs->data[tensor_idx]; + if (index < 0) { + return nullptr; + } + + return compressed_tensors_->tensors[index]; +} + +#endif // USE_TFLM_COMPRESSION + } // namespace tflite diff --git a/tensorflow/lite/micro/fake_micro_context.h b/tensorflow/lite/micro/fake_micro_context.h index 46d8a9b1ec4..7cf9c682e5c 100644 --- a/tensorflow/lite/micro/fake_micro_context.h +++ b/tensorflow/lite/micro/fake_micro_context.h @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -30,7 +30,12 @@ class FakeMicroContext : public MicroContext { ~FakeMicroContext() = default; FakeMicroContext(TfLiteTensor* tensors, SingleArenaBufferAllocator* allocator, - MicroGraph* micro_graph); + MicroGraph* micro_graph +#ifdef USE_TFLM_COMPRESSION + , + const CompressedTensorList* compressed_tensors = nullptr +#endif // USE_TFLM_COMPRESSION + ); void* AllocatePersistentBuffer(size_t bytes) override; TfLiteStatus RequestScratchBufferInArena(size_t bytes, @@ -50,6 +55,24 @@ class FakeMicroContext : public MicroContext { void* external_context() override; MicroGraph& graph() override; +#ifdef USE_TFLM_COMPRESSION + + // Available during Prepare & Eval. Returns false if tensor is not + // compressed. + bool IsTensorCompressed(const TfLiteNode* node, int tensor_idx) override; + + // Only available during Prepare. The kernel is responsible for storing the + // scratch buffer handle. + int AllocateDecompressionScratchBuffer(const TfLiteNode* node, + int tensor_idx) override; + + // Available during Prepare & Eval. Returns nullptr if tensor is not + // compressed. + const CompressionTensorData* GetTensorCompressionData( + const TfLiteNode* node, int tensor_idx) override; + +#endif // USE_TFLM_COMPRESSION + private: static constexpr int kNumScratchBuffers_ = 12; @@ -62,6 +85,15 @@ class FakeMicroContext : public MicroContext { SingleArenaBufferAllocator* allocator_; +#ifdef USE_TFLM_COMPRESSION + + // + // Compression + // + const CompressedTensorList* compressed_tensors_; + +#endif // USE_TFLM_COMPRESSION + TF_LITE_REMOVE_VIRTUAL_DELETE }; diff --git a/tensorflow/lite/micro/kernels/conv.cc b/tensorflow/lite/micro/kernels/conv.cc index 0df35fce4eb..7364d609e5b 100644 --- a/tensorflow/lite/micro/kernels/conv.cc +++ b/tensorflow/lite/micro/kernels/conv.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -45,15 +45,36 @@ TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->user_data != nullptr); const auto& data = *(static_cast(node->user_data)); +#ifdef USE_TFLM_COMPRESSION + + // TODO(ddavis-2015): make micro_context a const pointer + MicroContext* micro_context = GetMicroContext(context); + + const CompressionTensorData* weights_comp_td = + micro_context->GetTensorCompressionData(node, kConvWeightsTensor); + const CompressionTensorData* bias_comp_td = + micro_context->GetTensorCompressionData(node, kConvBiasTensor); + +#endif // USE_TFLM_COMPRESSION + switch (input->type) { // Already know in/out types are same. 
case kTfLiteFloat32: { tflite::reference_ops::Conv( ConvParamsFloat(params, data), tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(micro_context, bias, bias_comp_td, + data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output), tflite::micro::GetTensorShape(nullptr), nullptr); @@ -67,9 +88,18 @@ TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); } else if (bias->type == kTfLiteInt64) { @@ -79,9 +109,18 @@ TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); } else { @@ -119,9 +158,19 @@ TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); break; diff --git a/tensorflow/lite/micro/kernels/conv.h b/tensorflow/lite/micro/kernels/conv.h index 0c8073f48f0..0090053e03c 100644 --- a/tensorflow/lite/micro/kernels/conv.h +++ b/tensorflow/lite/micro/kernels/conv.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -49,6 +49,14 @@ struct OpDataConv { // A buffer used to store unpacked filter values. This is used if the source // tensor is of n-bit precision that cannot be easily processed by kernels. int filter_buffer_index; + +#ifdef USE_TFLM_COMPRESSION + + // scratch buffers for compressed tensors + int weights_scratch_index; + int bias_scratch_index; + +#endif // USE_TFLM_COMPRESSION }; extern const int kConvInputTensor; diff --git a/tensorflow/lite/micro/kernels/conv_common.cc b/tensorflow/lite/micro/kernels/conv_common.cc index 51c7a6ff2d6..9f0f2f79588 100644 --- a/tensorflow/lite/micro/kernels/conv_common.cc +++ b/tensorflow/lite/micro/kernels/conv_common.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -209,6 +209,23 @@ TfLiteStatus ConvPrepare(TfLiteContext* context, TfLiteNode* node) { &data->filter_buffer_index); } +#ifdef USE_TFLM_COMPRESSION + + // Compression scratch buffers. + // These will only be allocated if the tensor is compressed. + if (micro_context->IsTensorCompressed(node, kConvWeightsTensor) && + filter->type == kTfLiteInt4) { + MicroPrintf("Compression not supported with INT4 tensors"); + return kTfLiteError; + } + data->weights_scratch_index = + micro_context->AllocateDecompressionScratchBuffer(node, + kConvWeightsTensor); + data->bias_scratch_index = + micro_context->AllocateDecompressionScratchBuffer(node, kConvBiasTensor); + +#endif // USE_TFLM_COMPRESSION + micro_context->DeallocateTempTfLiteTensor(filter); micro_context->DeallocateTempTfLiteTensor(input); micro_context->DeallocateTempTfLiteTensor(output); diff --git a/tensorflow/lite/micro/kernels/conv_test.cc b/tensorflow/lite/micro/kernels/conv_test.cc index 0fb9411a3f0..0c3e0f06937 100644 --- a/tensorflow/lite/micro/kernels/conv_test.cc +++ b/tensorflow/lite/micro/kernels/conv_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/conv_test.h" +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/micro/kernels/kernel_runner.h" @@ -46,6 +48,83 @@ static int kOutputShape[] = {4, 2, 1, 2, 3}; static const float kGoldenData[kOutputElements] = {18, 2, 5, 18, 2, 5, 17, 4, 3, 37, 4, 3}; +#ifdef USE_TFLM_COMPRESSION + +// compressed filter data for kBinQuant scheme, matches kFilterData +constexpr uint8_t kBinQuantFilterData[] = { + 0x05, 0x38, 0x20, 0x90, 0x00, +}; +constexpr float kBinQuantFilterValueTable[] = { + 1, 2, 3, 4, -1, +}; +constexpr int kBinQuantFilterBitWidth = 3; +// compressed bias data for kBinQuant scheme, matches kBiasData +constexpr uint8_t kBinQuantBiasData[] = {0x18}; +constexpr int kBinQuantBiasBitWidth = 2; + +// Common inputs and outputs for quantized compressed tensor tests. +// Values from TfLite conv_test.cc SimplePerChannelTest. 
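+//
+// Note on the compressed arrays below (an illustrative derivation, assuming
+// the kBinQuant LUT packing described in compression/metadata.fbs): each
+// output channel's filter values map to 3-bit indexes into that channel's
+// 8-element slice of kBinQuantFilterValueTableQ1, packed big-endian. E.g.
+// channel 0 values {1, 2, 3, 4, 3, 4, 5, 6} -> indexes {0, 1, 2, 3, 2, 3, 4, 5}
+// -> bytes {0x05, 0x34, 0xE5}.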
+static int kInputShapeQ1[] = {4, 1, 2, 3, 2}; +static const float kInputDataQ1[] = { + // [1 * 2 * 3 * 2] as [batch, y, x, input_channel] + 3, 2, // batch = 0, y = 0, x = 0 + 1, -1, // batch = 0, y = 0, x = 1 + -2, -3, // batch = 0, y = 0, x = 2 + 4, 3, // batch = 0, y = 1, x = 0 + 2, -2, // batch = 0, y = 1, x = 1 + -3, -4, // batch = 0, y = 1, x = 2 +}; +constexpr size_t kInputElementsQ1 = std::extent::value; + +constexpr int kFilterNumChannelsQ1 = 2; +static int kFilterShapeQ1[] = {4, 2, 2, 2, 2}; +static const float kFilterDataQ1[] = { + // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel] + 1, 2, // out channel = 0, y = 0, x = 0 + 3, 4, // out channel = 0, y = 0, x = 1 + 3, 4, // out channel = 0, y = 1, x = 0 + 5, 6, // out channel = 0, y = 1, x = 1 + 7, 8, // out channel = 1, y = 0, x = 0 + 5, 6, // out channel = 1, y = 0, x = 1 + 3, 4, // out channel = 1, y = 1, x = 0 + 1, 2, // out channel = 1, y = 1, x = 1 +}; +constexpr size_t kFilterElementsQ1 = + std::extent::value; + +static int kBiasShapeQ1[] = {1, 2}; +static const float kBiasDataQ1[] = {3, -2}; +constexpr size_t kBiasElementsQ1 = std::extent::value; + +static int kOutputShapeQ1[] = {4, 1, 1, 2, 2}; +static const float kGoldenDataQ1[] = {31, 64, -57, -46}; +constexpr int kOutputElementsQ1 = std::extent::value; +static const float kGoldenDataQ1_16[] = {31, 63.99804688, -57, -46}; + +// compressed filter data for kBinQuant scheme, matches kFilterDataQ1 +constexpr uint8_t kBinQuantFilterDataQ1[] = { + 0x05, 0x34, 0xE5, 0xDE, 0x54, 0xC1, +}; +constexpr float kBinQuantFilterValueTableQ1[] = { + 1, 2, 3, 4, 5, 6, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, +}; +constexpr int kBinQuantFilterBitWidthQ1 = 3; +// compressed bias data for kBinQuant scheme, matches kBiasDataQ1 +constexpr uint8_t kBinQuantBiasDataQ1[] = {0x00}; +constexpr int kBinQuantBiasBitWidthQ1 = 1; + +static TfLiteConvParams common_conv_params_q1 = { + kTfLitePaddingValid, // padding + 1, // stride_width + 1, // stride_height + kTfLiteActNone, // activation + 1, // dilation_width_factor + 1, // dilation_height_factor + kTfLiteNoType // quantized_bias_type +}; + +#endif // USE_TFLM_COMPRESSION + static TfLiteConvParams common_conv_params = { kTfLitePaddingValid, // padding 2, // stride_width @@ -122,6 +201,66 @@ TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannel) { output_data)); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannelCompressed) { + const float input_scale = 0.5f; + const float output_scale = 0.5f; + const int input_zero_point = -1; + const int output_zero_point = -1; + constexpr float filter_scales[] = {tflite::testing::kFilterNumChannelsQ1, + 1.0f, 2.0f}; + constexpr int filter_zero_points[] = {tflite::testing::kFilterNumChannelsQ1, + 0, 0}; + // bias scales and zero points will be computed + float bias_scales[std::extent::value] = {}; + int bias_zero_points[std::extent::value] = {}; + + int8_t input_quantized[tflite::testing::kInputElementsQ1]; + int8_t filter_quantized[tflite::testing::kFilterElementsQ1]; + int32_t bias_quantized[tflite::testing::kBiasElementsQ1]; + int8_t golden_quantized[tflite::testing::kOutputElementsQ1]; + int8_t output_quantized[tflite::testing::kOutputElementsQ1]; + + tflite::testing::TestCompressionQuantizedInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + + comp_info.filter_value_table = filter_quantized; + comp_info.filter_value_table_stride = + std::extent< + decltype(tflite::testing::kBinQuantFilterValueTableQ1)>::value / + tflite::testing::kFilterNumChannelsQ1; + 
comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidthQ1; + comp_info.filter_compressed = tflite::testing::kBinQuantFilterDataQ1; + comp_info.filter_data = tflite::testing::kBinQuantFilterValueTableQ1; + comp_info.filter_dims_data = tflite::testing::kFilterShapeQ1; + comp_info.filter_scales = filter_scales; + comp_info.filter_zero_points = filter_zero_points; + + comp_info.bias_value_table = bias_quantized; + comp_info.bias_value_table_stride = + std::extent::value / + tflite::testing::kFilterNumChannelsQ1; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidthQ1; + comp_info.bias_compressed = tflite::testing::kBinQuantBiasDataQ1; + comp_info.bias_data = tflite::testing::kBiasDataQ1; + comp_info.bias_dims_data = tflite::testing::kBiasShapeQ1; + comp_info.bias_scales = bias_scales; + comp_info.bias_zero_points = bias_zero_points; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestConvQuantizedPerChannelCompressed( + tflite::testing::kInputShapeQ1, tflite::testing::kInputDataQ1, + input_quantized, input_scale, input_zero_point, + tflite::testing::kOutputShapeQ1, tflite::testing::kGoldenDataQ1, + golden_quantized, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params_q1, tflite::Register_CONV_2D(), + &comp_info)); +} + +#endif // USE_TFLM_COMPRESSION + TF_LITE_MICRO_TEST(SimpleTestFloat) { float output_data[tflite::testing::kOutputElements]; @@ -136,6 +275,37 @@ TF_LITE_MICRO_TEST(SimpleTestFloat) { output_data)); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestFloatCompressed) { + tflite::testing::TestCompressionInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + comp_info.filter_value_table = tflite::testing::kBinQuantFilterValueTable; + comp_info.filter_value_table_stride = + std::extent::value; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidth; + comp_info.bias_value_table = tflite::testing::kBiasData; + comp_info.bias_value_table_stride = + std::extent::value; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidth; + + float output_data[tflite::testing::kOutputElements]; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestConvFloat( + tflite::testing::kInputShape, tflite::testing::kInputData, + tflite::testing::kFilterShape, + reinterpret_cast(tflite::testing::kBinQuantFilterData), + tflite::testing::kBiasShape, + reinterpret_cast(tflite::testing::kBinQuantBiasData), + tflite::testing::kOutputShape, tflite::testing::kGoldenData, + &tflite::testing::common_conv_params, tflite::Register_CONV_2D(), + output_data, &comp_info)); +} + +#endif + TF_LITE_MICRO_TEST(InputAndFilterSameWidthHeight) { const int output_dims_count = 2; float output_data[output_dims_count]; @@ -246,6 +416,66 @@ TF_LITE_MICRO_TEST(SimpleTestQuantized16x8PerChannel64bBias) { output_data)); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestQuantized16x8PerChannel64bBiasCompressed) { + const float input_scale = 128.0f / 65536; + const float output_scale = 128.0f / 65536; + const int input_zero_point = 0; + const int output_zero_point = 0; + constexpr float filter_scales[] = {tflite::testing::kFilterNumChannelsQ1, + 1.0f, 2.0f}; + constexpr int filter_zero_points[] = {tflite::testing::kFilterNumChannelsQ1, + 0, 0}; + // bias scales and zero points will be computed + float bias_scales[std::extent::value] = {}; + int bias_zero_points[std::extent::value] = {}; + + int16_t input_quantized[tflite::testing::kInputElementsQ1]; + int8_t 
filter_quantized[tflite::testing::kFilterElementsQ1]; + int64_t bias_quantized[tflite::testing::kBiasElementsQ1]; + int16_t golden_quantized[tflite::testing::kOutputElementsQ1]; + int16_t output_quantized[tflite::testing::kOutputElementsQ1]; + + tflite::testing::TestCompressionQuantizedInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + + comp_info.filter_value_table = filter_quantized; + comp_info.filter_value_table_stride = + std::extent< + decltype(tflite::testing::kBinQuantFilterValueTableQ1)>::value / + tflite::testing::kFilterNumChannelsQ1; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidthQ1; + comp_info.filter_compressed = tflite::testing::kBinQuantFilterDataQ1; + comp_info.filter_data = tflite::testing::kBinQuantFilterValueTableQ1; + comp_info.filter_dims_data = tflite::testing::kFilterShapeQ1; + comp_info.filter_scales = filter_scales; + comp_info.filter_zero_points = filter_zero_points; + + comp_info.bias_value_table = bias_quantized; + comp_info.bias_value_table_stride = + std::extent::value / + tflite::testing::kFilterNumChannelsQ1; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidthQ1; + comp_info.bias_compressed = tflite::testing::kBinQuantBiasDataQ1; + comp_info.bias_data = tflite::testing::kBiasDataQ1; + comp_info.bias_dims_data = tflite::testing::kBiasShapeQ1; + comp_info.bias_scales = bias_scales; + comp_info.bias_zero_points = bias_zero_points; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestConvQuantizedPerChannelCompressed( + tflite::testing::kInputShapeQ1, tflite::testing::kInputDataQ1, + input_quantized, input_scale, input_zero_point, + tflite::testing::kOutputShapeQ1, tflite::testing::kGoldenDataQ1_16, + golden_quantized, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params_q1, tflite::Register_CONV_2D(), + &comp_info)); +} + +#endif // USE_TFLM_COMPRESSION + TF_LITE_MICRO_TEST(SimpleTestQuantized16x8PerChannel32bBias) { const int output_dims_count = 12; int16_t output_data[output_dims_count]; @@ -276,6 +506,66 @@ TF_LITE_MICRO_TEST(SimpleTestQuantized16x8PerChannel32bBias) { output_data)); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestQuantized16x8PerChannel32bBiasCompressed) { + const float input_scale = 128.0f / 65536; + const float output_scale = 128.0f / 65536; + const int input_zero_point = 0; + const int output_zero_point = 0; + constexpr float filter_scales[] = {tflite::testing::kFilterNumChannelsQ1, + 1.0f, 2.0f}; + constexpr int filter_zero_points[] = {tflite::testing::kFilterNumChannelsQ1, + 0, 0}; + // bias scales and zero points will be computed + float bias_scales[std::extent::value] = {}; + int bias_zero_points[std::extent::value] = {}; + + int16_t input_quantized[tflite::testing::kInputElementsQ1]; + int8_t filter_quantized[tflite::testing::kFilterElementsQ1]; + int32_t bias_quantized[tflite::testing::kBiasElementsQ1]; + int16_t golden_quantized[tflite::testing::kOutputElementsQ1]; + int16_t output_quantized[tflite::testing::kOutputElementsQ1]; + + tflite::testing::TestCompressionQuantizedInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + + comp_info.filter_value_table = filter_quantized; + comp_info.filter_value_table_stride = + std::extent< + decltype(tflite::testing::kBinQuantFilterValueTableQ1)>::value / + tflite::testing::kFilterNumChannelsQ1; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidthQ1; + comp_info.filter_compressed = 
tflite::testing::kBinQuantFilterDataQ1; + comp_info.filter_data = tflite::testing::kBinQuantFilterValueTableQ1; + comp_info.filter_dims_data = tflite::testing::kFilterShapeQ1; + comp_info.filter_scales = filter_scales; + comp_info.filter_zero_points = filter_zero_points; + + comp_info.bias_value_table = bias_quantized; + comp_info.bias_value_table_stride = + std::extent::value / + tflite::testing::kFilterNumChannelsQ1; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidthQ1; + comp_info.bias_compressed = tflite::testing::kBinQuantBiasDataQ1; + comp_info.bias_data = tflite::testing::kBiasDataQ1; + comp_info.bias_dims_data = tflite::testing::kBiasShapeQ1; + comp_info.bias_scales = bias_scales; + comp_info.bias_zero_points = bias_zero_points; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestConvQuantizedPerChannelCompressed( + tflite::testing::kInputShapeQ1, tflite::testing::kInputDataQ1, + input_quantized, input_scale, input_zero_point, + tflite::testing::kOutputShapeQ1, tflite::testing::kGoldenDataQ1_16, + golden_quantized, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params_q1, tflite::Register_CONV_2D(), + &comp_info)); +} + +#endif // USE_TFLM_COMPRESSION + TF_LITE_MICRO_TEST(SimpleTestDilatedQuantizedPerChannel) { const int output_dims_count = 24; int8_t output_data[output_dims_count]; @@ -1190,3 +1480,60 @@ TF_LITE_MICRO_TEST(Int8Filter8x3x3x3PerChannelScaleRelu6ShouldMatchGolden) { } TF_LITE_MICRO_TESTS_END + +// {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1}, +// {TensorType_INT8, +// // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel] +// {2, 2, 2, 2}, +// 0, +// 0, +// 0, +// 0, +// /*per_channel_quantization=*/true, +// /*per_channel_quantization_scales=*/{1, 2}, +// /*per_channel_quantization_offsets=*/{0, 0}, +// /*channel_index=*/0}, +// {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, +// /*stride_width=*/1, /*stride_height=*/1); +// m.SetInput({ +// // [1 * 2 * 3 * 2] as [batch, y, x, input_channel] +// 3, 2, // batch = 0, y = 0, x = 0 +// 1, -1, // batch = 0, y = 0, x = 1 +// -2, -3, // batch = 0, y = 0, x = 2 +// 4, 3, // batch = 0, y = 1, x = 0 +// 2, -2, // batch = 0, y = 1, x = 1 +// -3, -4, // batch = 0, y = 1, x = 2 +// }); +// m.SetFilter( +// // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel] +// { +// 1, 2, // out channel = 0, y = 0, x = 0 +// 3, 4, // out channel = 0, y = 0, x = 1 +// 3, 4, // out channel = 0, y = 1, x = 0 +// 5, 6, // out channel = 0, y = 1, x = 1 +// 7, 8, // out channel = 1, y = 0, x = 0 +// 5, 6, // out channel = 1, y = 0, x = 1 +// 3, 4, // out channel = 1, y = 1, x = 0 +// 1, 2, // out channel = 1, y = 1, x = 1 +// }); +// m.SetBias({3, -2}); +// // output has dimension [1 * 1 * 2 * 2] as [batch, y, x, output_channel] +// EXPECT_THAT(m.GetDequantizedOutput(), +// ElementsAreArray(ArrayFloatNear({31, 64, -57, -46}))); +// EXPECT_THAT(m.GetOutput(), ElementsAreArray({61, 127, -115, -93})); + +// TEST_P(ConvolutionOpTest, SimplePerChannel16x8Bias32) { +// const float scale = 128.0 / 65536; +// // output has dimension [1 * 1 * 2 * 2] as [batch, y, x, output_channel] +// EXPECT_THAT(m.GetDequantizedOutput(), +// ElementsAreArray(ArrayFloatNear({31, 63.99804688, -57, -46}))); +// EXPECT_THAT(m.GetOutput(), +// ElementsAreArray({15872, 32767, -29184, -23552})); + +// TEST_P(ConvolutionOpTest, SimplePerChannel16x8Bias64) { +// const float scale = 128.0 / 65536; +// // output has dimension [1 * 1 * 2 * 2] as [batch, y, x, output_channel] +// 
EXPECT_THAT(m.GetDequantizedOutput(), +// ElementsAreArray(ArrayFloatNear({31, 63.99804688, -57, -46}))); +// EXPECT_THAT(m.GetOutput(), +// ElementsAreArray({15872, 32767, -29184, -23552})); \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/conv_test.h b/tensorflow/lite/micro/kernels/conv_test.h index c655f043bcc..9df52b6b250 100644 --- a/tensorflow/lite/micro/kernels/conv_test.h +++ b/tensorflow/lite/micro/kernels/conv_test.h @@ -1,4 +1,4 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/kernels/conv.h" #include "tensorflow/lite/micro/kernels/kernel_runner.h" #include "tensorflow/lite/micro/kernels/micro_ops.h" #include "tensorflow/lite/micro/test_helpers.h" @@ -26,35 +27,180 @@ limitations under the License. namespace tflite { namespace testing { -TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size, - int output_length, TfLiteConvParams* conv_params, - TFLMRegistration registration, float* output_data); +constexpr int kMaxTensors = 4; + +#ifdef USE_TFLM_COMPRESSION + +template +struct TestCompressionInfo { + TFILTER* filter_value_table; + size_t filter_value_table_stride; + int filter_bit_width; + TBIAS* bias_value_table; + size_t bias_value_table_stride; + int bias_bit_width; + CompressionScheme scheme; +}; + +template +struct TestCompressionQuantizedInfo : TestCompressionInfo { + const uint8_t* filter_compressed; + const float* filter_data; + const int* filter_dims_data; // TfLiteIntArray + const float* filter_scales; // TfLiteFloatArray + const int* filter_zero_points; // TfLiteIntArray + const uint8_t* bias_compressed; + const float* bias_data; + const int* bias_dims_data; // TfLiteIntArray + float* bias_scales; // TfLiteFloatArray (computed) + int* bias_zero_points; // TfLiteIntArray (computed) +}; + +#endif // USE_TFLM_COMPRESSION + +template TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size, - int output_length, TfLiteConvParams* conv_params, - TFLMRegistration registration, int8_t* output_data); - -TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, - const float* expected_output_data, - int output_length, - TfLiteConvParams* conv_params, - TFLMRegistration registration, - float* output_data, float tolerance = 1e-5); - -TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, - const int8_t* expected_output_data, - int output_length, - TfLiteConvParams* conv_params, - TFLMRegistration registration, - int8_t* output_data, float tolerance = 1e-5); - -TfLiteStatus TestConvFloat(int* input_dims_data, const float* input_data, - int* filter_dims_data, const float* filter_data, - int* bias_dims_data, const float* bias_data, - int* output_dims_data, - const float* expected_output_data, - TfLiteConvParams* conv_params, - TFLMRegistration registration, float* output_data); + int output_length, const TfLiteConvParams* conv_params, + TFLMRegistration registration, T* output_data +#ifdef USE_TFLM_COMPRESSION + , + const CompressedTensorList* comp_list_p = nullptr +#endif // USE_TFLM_COMPRESSION +) { + // TODO(ddavis-2015): support optional bias tensor + int inputs_array_data[] = {3, 0, 1, 2}; + TfLiteIntArray* inputs_array = 
IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 3}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + + micro::KernelRunner runner(registration, tensors, tensors_size, inputs_array, + outputs_array, conv_params +#ifdef USE_TFLM_COMPRESSION + , + nullptr, comp_list_p +#endif // USE_TFLM_COMPRESSION + ); + + const char* init_data = reinterpret_cast(conv_params); + TfLiteStatus status = runner.InitAndPrepare(init_data); + if (status != kTfLiteOk) { + return status; + } + return runner.Invoke(); +} + +template +TfLiteStatus ValidateConvGoldens( + TfLiteTensor* tensors, int tensors_size, const T* expected_output_data, + int output_length, const TfLiteConvParams* conv_params, + TFLMRegistration registration, T* output_data, float tolerance = 1e-5 +#ifdef USE_TFLM_COMPRESSION + , + const TestCompressionInfo* comp_info = nullptr +#endif // USE_TFLM_COMPRESSION +) { +#ifdef USE_TFLM_COMPRESSION + + CompressionTensorData* compressed_tensors[kMaxTensors] = {}; + CompressionTensorData filter_comp_data = {}; + CompressionTensorData bias_comp_data = {}; + CompressedTensorList comp_list = {compressed_tensors}; + CompressedTensorList* comp_list_p = nullptr; + + if (comp_info != nullptr) { + if (comp_info->scheme == CompressionScheme::kBinQuant) { + if (comp_info->filter_value_table != nullptr) { + bool is_per_channel = + tensors[kConvWeightsTensor].type != kTfLiteFloat32 && + tensors[kConvWeightsTensor].dims->data[kConvQuantizedDimension] > 1; + compressed_tensors[kConvWeightsTensor] = &filter_comp_data; + filter_comp_data.scheme = CompressionScheme::kBinQuant; + filter_comp_data.data.bin_quant.compressed_bit_width = + comp_info->filter_bit_width; + filter_comp_data.data.bin_quant.value_table = + comp_info->filter_value_table; + filter_comp_data.data.bin_quant.value_table_channel_stride = + comp_info->filter_value_table_stride; + filter_comp_data.data.bin_quant.is_per_channel_quantized = + is_per_channel; + filter_comp_data.data.bin_quant.use_alternate_axis = false; + } + if (comp_info->bias_value_table != nullptr) { + bool is_per_channel = + tensors[kConvBiasTensor].type != kTfLiteFloat32 && + tensors[kConvBiasTensor].dims->data[kConvQuantizedDimension] > 1; + compressed_tensors[kConvBiasTensor] = &bias_comp_data; + bias_comp_data.scheme = CompressionScheme::kBinQuant; + bias_comp_data.data.bin_quant.compressed_bit_width = + comp_info->bias_bit_width; + bias_comp_data.data.bin_quant.value_table = comp_info->bias_value_table; + bias_comp_data.data.bin_quant.value_table_channel_stride = + comp_info->bias_value_table_stride; + bias_comp_data.data.bin_quant.is_per_channel_quantized = is_per_channel; + bias_comp_data.data.bin_quant.use_alternate_axis = false; + } + comp_list_p = &comp_list; + } else { + return kTfLiteError; + } + } + +#endif // USE_TFLM_COMPRESSION + + TfLiteStatus status = InvokeConv(tensors, tensors_size, output_length, + conv_params, registration, output_data +#ifdef USE_TFLM_COMPRESSION + , + comp_list_p +#endif // USE_TFLM_COMPRESSION + ); + if (status != kTfLiteOk) { + return status; + } + for (int i = 0; i < output_length; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], + tolerance); + } + return kTfLiteOk; +} + +template +TfLiteStatus TestConvFloat( + int* input_dims_data, const float* input_data, int* filter_dims_data, + const float* filter_data, int* bias_dims_data, const float* bias_data, + int* output_dims_data, const float* expected_output_data, + TfLiteConvParams* conv_params, TFLMRegistration 
registration, + float* output_data +#ifdef USE_TFLM_COMPRESSION + , + const TestCompressionInfo* comp_info = nullptr +#endif // USE_TFLM_COMPRESSION +) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateTensor(input_data, input_dims), + CreateTensor(filter_data, filter_dims), + CreateTensor(bias_data, bias_dims), + CreateTensor(output_data, output_dims), + }; + + return ValidateConvGoldens(tensors, tensors_size, expected_output_data, + output_dims_count, conv_params, registration, + output_data +#ifdef USE_TFLM_COMPRESSION + , + 1e-5, comp_info +#endif // USE_TFLM_COMPRESSION + ); +} TfLiteStatus TestConvQuantizedPerChannel( int* input_dims_data, const float* input_data, int8_t* input_quantized, @@ -88,6 +234,80 @@ TfLiteStatus TestConvQuantizedPerChannel( float output_scale, int output_zero_point, TfLiteConvParams* conv_params, TFLMRegistration registration, int16_t* output_data); +#ifdef USE_TFLM_COMPRESSION + +template +TfLiteStatus TestConvQuantizedPerChannelCompressed( + int* input_dims_data, const float* input_data, TIO* input_quantized, + float input_scale, int input_zero_point, int* output_dims_data, + const float* expected_output_data, TIO* expected_output_quantized, + TIO* output_quantized, float output_scale, int output_zero_point, + const TfLiteConvParams* conv_params, TFLMRegistration registration, + const TestCompressionQuantizedInfo* comp_info) { + // TODO(ddavis-2015): account for optional bias tensor + // bool null_bias = comp_info->bias_data == nullptr ? 
true : false; + + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(comp_info->filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(comp_info->bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + + TfLiteFloatArray* filter_scales = + FloatArrayFromFloats(comp_info->filter_scales); + TfLiteIntArray* filter_zero_points = + IntArrayFromInts(comp_info->filter_zero_points); + TfLiteFloatArray* bias_scales = FloatArrayFromFloats(comp_info->bias_scales); + TfLiteIntArray* bias_zero_points = + IntArrayFromInts(comp_info->bias_zero_points); + + TfLiteAffineQuantization filter_quant = {}; + TfLiteTensor filter_tensor = CreatePerChannelQuantizedTensor( + comp_info->filter_compressed, filter_dims, filter_scales, + filter_zero_points, &filter_quant, kConvQuantizedDimension, + false /* is_variable */, kTfLiteInt8); + SymmetricPerChannelQuantize( + comp_info->filter_data, comp_info->filter_value_table, + ElementCount(*filter_dims), filter_scales->size, filter_scales->data); + + TfLiteAffineQuantization bias_quant = {}; + TfLiteTensor bias_tensor = CreatePerChannelQuantizedBiasTensor( + comp_info->bias_compressed, bias_dims, input_scale, filter_scales, + bias_scales, bias_zero_points, &bias_quant, kConvQuantizedDimension, + false /* is_variable */, typeToTfLiteType()); + SymmetricPerChannelQuantize(comp_info->bias_data, comp_info->bias_value_table, + ElementCount(*bias_dims), bias_scales->size, + bias_scales->data); + + for (int i = 0; i < ElementCount(*bias_dims); i++) { + int64_t bias_data0 = comp_info->bias_value_table[i]; + MicroPrintf( + "bias scale %f bias zero_point %d" + " bias data %f bias data quantized %lld", + (double)bias_scales->data[i], bias_zero_points->data[i], + (double)comp_info->bias_data[i], bias_data0); + } + + constexpr int tensors_size = kMaxTensors; + TfLiteTensor tensors[tensors_size] = { + CreateQuantizedTensor(input_data, input_quantized, input_dims, + input_scale, input_zero_point), + filter_tensor, + bias_tensor, + CreateQuantizedTensor(output_quantized, output_dims, output_scale, + output_zero_point), + }; + + const int output_dims_count = ElementCount(*output_dims); + Quantize(expected_output_data, expected_output_quantized, output_dims_count, + output_scale, output_zero_point); + return ValidateConvGoldens(tensors, tensors_size, expected_output_quantized, + output_dims_count, conv_params, registration, + output_quantized, 1.0e-5f /* tolerance */, + comp_info); +} + +#endif // USE_TFLM_COMPRESSION + } // namespace testing } // namespace tflite diff --git a/tensorflow/lite/micro/kernels/conv_test_common.cc b/tensorflow/lite/micro/kernels/conv_test_common.cc index a0f733b8e42..7b6f71a8fc3 100644 --- a/tensorflow/lite/micro/kernels/conv_test_common.cc +++ b/tensorflow/lite/micro/kernels/conv_test_common.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,108 +18,6 @@ limitations under the License. 
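Two notes on TestConvQuantizedPerChannelCompressed above. First, the raw arrays handed to IntArrayFromInts and FloatArrayFromFloats follow the test-helper convention that element 0 of the array is its length, so filter_scales = {2, 1.0f, 2.0f} and filter_zero_points = {2, 0, 0} describe two per-channel scales and zero points; the helpers reinterpret that storage as TfLiteIntArray / TfLiteFloatArray rather than copying it. A minimal sketch of the convention (the array names here are illustrative):

#include "tensorflow/lite/micro/test_helpers.h"

void LengthPrefixedArraySketch() {
  // Element 0 is the count; the remaining elements are the payload.
  static int zero_points_raw[] = {2, 0, 0};     // two channels, zero points {0, 0}
  static float scales_raw[] = {2, 1.0f, 2.0f};  // two channels, scales {1.0, 2.0}
  TfLiteIntArray* zero_points =
      tflite::testing::IntArrayFromInts(zero_points_raw);
  TfLiteFloatArray* scales =
      tflite::testing::FloatArrayFromFloats(scales_raw);
  // zero_points->size == 2, scales->data[0] == 1.0f
  (void)zero_points;
  (void)scales;
}

Second, the caller passes in zero-initialized bias_scales and bias_zero_points because CreatePerChannelQuantizedBiasTensor computes them (hence the "will be computed" comment in the tests): following the usual TfLite rule, each bias channel is quantized with scale input_scale * filter_scale[c] and zero point 0, which for the Q1 data gives bias scales of 0.5 * 1.0 = 0.5 and 0.5 * 2.0 = 1.0 (and 128/65536 times 1.0 and 2.0 in the 16x8 variants). The MicroPrintf loop in that helper simply dumps the computed values for debugging.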
namespace tflite { namespace testing { -template -TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size, - int output_length, TfLiteConvParams* conv_params, - TFLMRegistration registration, T* output_data) { - int inputs_array_data[] = {3, 0, 1, 2}; - TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); - int outputs_array_data[] = {1, 3}; - TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); - - micro::KernelRunner runner(registration, tensors, tensors_size, inputs_array, - outputs_array, conv_params); - - const char* init_data = reinterpret_cast(conv_params); - TfLiteStatus status = runner.InitAndPrepare(init_data); - if (status != kTfLiteOk) { - return status; - } - return runner.Invoke(); -} - -template -TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, - const T* expected_output_data, - int output_length, - TfLiteConvParams* conv_params, - TFLMRegistration registration, T* output_data, - float tolerance) { - TfLiteStatus status = InvokeConv(tensors, tensors_size, output_length, - conv_params, registration, output_data); - if (status != kTfLiteOk) { - return status; - } - for (int i = 0; i < output_length; ++i) { - TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], - tolerance); - } - return kTfLiteOk; -} - -TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size, - int output_length, TfLiteConvParams* conv_params, - TFLMRegistration registration, float* output_data) { - return InvokeConv(tensors, tensors_size, output_length, conv_params, - registration, output_data); -} - -TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size, - int output_length, TfLiteConvParams* conv_params, - TFLMRegistration registration, int8_t* output_data) { - return InvokeConv(tensors, tensors_size, output_length, conv_params, - registration, output_data); -} - -TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, - const float* expected_output_data, - int output_length, - TfLiteConvParams* conv_params, - TFLMRegistration registration, - float* output_data, float tolerance) { - return ValidateConvGoldens(tensors, tensors_size, expected_output_data, - output_length, conv_params, registration, - output_data, tolerance); -} - -TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, - const int8_t* expected_output_data, - int output_length, - TfLiteConvParams* conv_params, - TFLMRegistration registration, - int8_t* output_data, float tolerance) { - return ValidateConvGoldens( - tensors, tensors_size, expected_output_data, output_length, conv_params, - registration, output_data, tolerance); -} - -TfLiteStatus TestConvFloat(int* input_dims_data, const float* input_data, - int* filter_dims_data, const float* filter_data, - int* bias_dims_data, const float* bias_data, - int* output_dims_data, - const float* expected_output_data, - TfLiteConvParams* conv_params, - TFLMRegistration registration, float* output_data) { - TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); - TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); - TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - CreateTensor(input_data, input_dims), - CreateTensor(filter_data, filter_dims), 
- CreateTensor(bias_data, bias_dims), - CreateTensor(output_data, output_dims), - }; - - return ValidateConvGoldens(tensors, tensors_size, expected_output_data, - output_dims_count, conv_params, registration, - output_data); -} - template TfLiteStatus TestConvQuantizedPerChannel( int* input_dims_data, const float* input_data, T* input_quantized, diff --git a/tensorflow/lite/micro/kernels/fully_connected.cc b/tensorflow/lite/micro/kernels/fully_connected.cc index 65c83792e87..c779ea329f3 100644 --- a/tensorflow/lite/micro/kernels/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/fully_connected.cc @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -60,7 +60,7 @@ TfLiteStatus FullyConnectedPrepare(TfLiteContext* context, TfLiteNode* node) { (input->type == kTfLiteInt8 && (filter->type != kTfLiteInt8 && filter->type != kTfLiteInt4)) || (input->type == kTfLiteInt16 && filter->type != kTfLiteInt8)) { - MicroPrintf("Input type: %s with filter type : %s not supported.", + MicroPrintf("Input type: %s with filter type: %s not supported.", TfLiteTypeGetName(input->type), TfLiteTypeGetName(filter->type)); return kTfLiteError; @@ -79,6 +79,23 @@ TfLiteStatus FullyConnectedPrepare(TfLiteContext* context, TfLiteNode* node) { context, params->activation, input->type, input, filter, bias, output, data)); +#ifdef USE_TFLM_COMPRESSION + + // Compression scratch buffers. + // These will only be allocated if the tensor is compressed. + if (micro_context->IsTensorCompressed(node, kFullyConnectedWeightsTensor) && + filter->type == kTfLiteInt4) { + MicroPrintf("Compression not supported with INT4 tensors"); + return kTfLiteError; + } + data->weights_scratch_index = + micro_context->AllocateDecompressionScratchBuffer( + node, kFullyConnectedWeightsTensor); + data->bias_scratch_index = micro_context->AllocateDecompressionScratchBuffer( + node, kFullyConnectedBiasTensor); + +#endif // USE_TFLM_COMPRESSION + micro_context->DeallocateTempTfLiteTensor(input); micro_context->DeallocateTempTfLiteTensor(filter); if (bias != nullptr) { @@ -102,8 +119,20 @@ TfLiteStatus FullyConnectedEval(TfLiteContext* context, TfLiteNode* node) { TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, kFullyConnectedOutputTensor); - TFLITE_DCHECK(node->user_data != nullptr); +#ifdef USE_TFLM_COMPRESSION + + // TODO(ddavis-2015): make micro_context a const pointer + MicroContext* micro_context = GetMicroContext(context); + + const CompressionTensorData* weights_comp_td = + micro_context->GetTensorCompressionData(node, + kFullyConnectedWeightsTensor); + const CompressionTensorData* bias_comp_td = + micro_context->GetTensorCompressionData(node, kFullyConnectedBiasTensor); +#endif // USE_TFLM_COMPRESSION + + TFLITE_DCHECK(node->user_data != nullptr); const auto& data = *(static_cast(node->user_data)); @@ -115,9 +144,18 @@ TfLiteStatus FullyConnectedEval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(micro_context, bias, bias_comp_td, + data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION 
tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); break; @@ -149,9 +187,18 @@ TfLiteStatus FullyConnectedEval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); break; @@ -173,9 +220,18 @@ TfLiteStatus FullyConnectedEval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); break; diff --git a/tensorflow/lite/micro/kernels/fully_connected.h b/tensorflow/lite/micro/kernels/fully_connected.h index 8308838ec6d..d7ea705964c 100644 --- a/tensorflow/lite/micro/kernels/fully_connected.h +++ b/tensorflow/lite/micro/kernels/fully_connected.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -46,6 +46,14 @@ struct OpDataFullyConnected { // tensor is of n-bit precision that cannot be easily processed by kernels. int filter_buffer_index; #endif + +#ifdef USE_TFLM_COMPRESSION + + // scratch buffers for compressed tensors + int weights_scratch_index; + int bias_scratch_index; + +#endif // USE_TFLM_COMPRESSION }; extern const int kFullyConnectedInputTensor; diff --git a/tensorflow/lite/micro/kernels/fully_connected_test.cc b/tensorflow/lite/micro/kernels/fully_connected_test.cc index 2ad132055b8..b88d5635815 100644 --- a/tensorflow/lite/micro/kernels/fully_connected_test.cc +++ b/tensorflow/lite/micro/kernels/fully_connected_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
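A note on the compression constants introduced in the next hunk: they follow the same packing as the conv tests. kBinQuantFilterData repeats the byte sequence 0x01 0x23 0x45 0x67 0x89 three times, i.e. the 4-bit indices 0 through 9 three times over; looked up in kBinQuantFilterValueTable {1, ..., 10} this appears to reproduce the three 10-element rows of simple_weights_data. kBinQuantBiasData's single byte 0x18 holds the three 2-bit indices 0, 1, 2 (0b00 01 10 plus two padding bits), selecting the bias values in order.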
@@ -42,6 +42,20 @@ const float simple_weights_data[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 }; +#ifdef USE_TFLM_COMPRESSION + +// compressed filter data for kBinQuant scheme +constexpr uint8_t kBinQuantFilterData[] = {0x01, 0x23, 0x45, 0x67, 0x89, + 0x01, 0x23, 0x45, 0x67, 0x89, + 0x01, 0x23, 0x45, 0x67, 0x89}; +constexpr float kBinQuantFilterValueTable[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; +constexpr int kBinQuantFilterBitWidth = 4; +// compressed bias data for kBinQuant scheme +constexpr uint8_t kBinQuantBiasData[] = {0x18}; +constexpr int kBinQuantBiasBitWidth = 2; + +#endif // USE_TFLM_COMPRESSION + // TODO(b/258710417): INT4 isn't currently supported on Hexagon. #if !defined(HEXAGON) const float simple_int4_weights_data[] = { @@ -241,11 +255,50 @@ const float representative_64x16_golden[] = { const int representative_64x16_output_size = 16; int representative_64x16_output_dims[] = {2, 1, 16}; -template +constexpr int kMaxTensors = 4; + +#ifdef USE_TFLM_COMPRESSION + +template +struct TestCompressionInfo { + TFILTER* filter_value_table; + size_t filter_value_table_stride; + int filter_bit_width; + bool use_filter_alt_axis; + TBIAS* bias_value_table; + size_t bias_value_table_stride; + int bias_bit_width; + bool use_bias_alt_axis; + CompressionScheme scheme; +}; + +template +struct TestCompressionQuantizedInfo : TestCompressionInfo { + const uint8_t* filter_compressed; + const float* filter_data; + const int* filter_dims_data; // TfLiteIntArray + const float* filter_scales; // TfLiteFloatArray + const int* filter_zero_points; // TfLiteIntArray + + const uint8_t* bias_compressed; + const float* bias_data; + const int* bias_dims_data; // TfLiteIntArray + float* bias_scales; // TfLiteFloatArray (computed) + int* bias_zero_points; // TfLiteIntArray (computed) +}; + +#endif // USE_TFLM_COMPRESSION + +template TfLiteStatus ValidateFullyConnectedGoldens( TfLiteTensor* tensors, const int tensors_size, bool null_bias, const TfLiteFusedActivation activation, const float tolerance, - const int output_len, const T* golden, T* output_data) { + const int output_len, const T* golden, T* output_data +#ifdef USE_TFLM_COMPRESSION + , + const TestCompressionInfo* comp_info = nullptr +#endif // USE_TFLM_COMPRESSION +) { TfLiteFullyConnectedParams builtin_data = { activation, kTfLiteFullyConnectedWeightsFormatDefault, false, false, kTfLiteNoType}; @@ -272,10 +325,56 @@ TfLiteStatus ValidateFullyConnectedGoldens( TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); +#ifdef USE_TFLM_COMPRESSION + + CompressionTensorData* compressed_tensors[kMaxTensors] = {}; + CompressionTensorData filter_comp_data = {}; + CompressionTensorData bias_comp_data = {}; + CompressedTensorList comp_list = {compressed_tensors}; + CompressedTensorList* comp_list_p = nullptr; + + if (comp_info != nullptr) { + if (comp_info->scheme == CompressionScheme::kBinQuant) { + if (comp_info->filter_value_table != nullptr) { + compressed_tensors[kFullyConnectedWeightsTensor] = &filter_comp_data; + filter_comp_data.scheme = CompressionScheme::kBinQuant; + filter_comp_data.data.bin_quant.compressed_bit_width = + comp_info->filter_bit_width; + filter_comp_data.data.bin_quant.value_table = + comp_info->filter_value_table; + filter_comp_data.data.bin_quant.value_table_channel_stride = + comp_info->filter_value_table_stride; + filter_comp_data.data.bin_quant.is_per_channel_quantized = false; + filter_comp_data.data.bin_quant.use_alternate_axis = false; + } + if 
(comp_info->bias_value_table != nullptr) { + compressed_tensors[kFullyConnectedBiasTensor] = &bias_comp_data; + bias_comp_data.scheme = CompressionScheme::kBinQuant; + bias_comp_data.data.bin_quant.compressed_bit_width = + comp_info->bias_bit_width; + bias_comp_data.data.bin_quant.value_table = comp_info->bias_value_table; + bias_comp_data.data.bin_quant.value_table_channel_stride = + comp_info->bias_value_table_stride; + bias_comp_data.data.bin_quant.is_per_channel_quantized = false; + bias_comp_data.data.bin_quant.use_alternate_axis = false; + } + comp_list_p = &comp_list; + } else { + return kTfLiteError; + } + } + +#endif // USE_TFLM_COMPRESSION + const TFLMRegistration registration = Register_FULLY_CONNECTED(); micro::KernelRunner runner(registration, tensors, tensors_size, inputs_array, outputs_array, - reinterpret_cast(&builtin_data)); + reinterpret_cast(&builtin_data), nullptr +#ifdef USE_TFLM_COMPRESSION + , + comp_list_p +#endif // USE_TFLM_COMPRESSION + ); TfLiteStatus status = runner.InitAndPrepare(); if (status != kTfLiteOk) { @@ -293,11 +392,17 @@ TfLiteStatus ValidateFullyConnectedGoldens( return kTfLiteOk; } +template TfLiteStatus TestFullyConnectedFloat( int* input_dims_data, const float* input_data, int* weights_dims_data, const float* weights_data, int* bias_dims_data, const float* bias_data, const float* golden, int* output_dims_data, - TfLiteFusedActivation activation, float* output_data) { + TfLiteFusedActivation activation, float* output_data +#ifdef USE_TFLM_COMPRESSION + , + const TestCompressionInfo* comp_info = nullptr +#endif // USE_TFLM_COMPRESSION +) { TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); TfLiteIntArray* weights_dims = IntArrayFromInts(weights_dims_data); TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); @@ -305,16 +410,15 @@ TfLiteStatus TestFullyConnectedFloat( const int output_dims_count = ElementCount(*output_dims); bool null_bias = bias_data == nullptr ? true : false; - constexpr int array_size = 4; // Avoid variable length array warning. - const int inputs_size = bias_data == nullptr ? 2 : 3; + const int inputs_size = null_bias ? 2 : 3; constexpr int outputs_size = 1; const int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[array_size]; + TfLiteTensor tensors[kMaxTensors]; tensors[0] = CreateTensor(input_data, input_dims); tensors[1] = CreateTensor(weights_data, weights_dims); - if (bias_data == nullptr) { + if (null_bias) { tensors[2] = CreateTensor(output_data, output_dims); } else { tensors[2] = CreateTensor(bias_data, bias_dims); @@ -323,7 +427,12 @@ TfLiteStatus TestFullyConnectedFloat( return ValidateFullyConnectedGoldens(tensors, tensors_size, null_bias, activation, 1e-4f, output_dims_count, - golden, output_data); + golden, output_data +#ifdef USE_TFLM_COMPRESSION + , + comp_info +#endif // USE_TFLM_COMPRESSION + ); } template @@ -345,7 +454,7 @@ TfLiteStatus TestFullyConnectedQuantized( bool null_bias = bias_data == nullptr ? true : false; constexpr int array_size = 4; // Avoid variable length array warning. - const int inputs_size = bias_data == nullptr ? 2 : 3; + const int inputs_size = null_bias ? 
2 : 3; constexpr int outputs_size = 1; const int tensors_size = inputs_size + outputs_size; TfLiteTensor tensors[array_size]; @@ -355,7 +464,7 @@ TfLiteStatus TestFullyConnectedQuantized( tensors[1] = CreateQuantizedTensor( weights_data, weights_quantized, weights_dims, weights_scale, weights_zero_point, false, weights_packed_type); - if (bias_data == nullptr) { + if (null_bias) { tensors[2] = CreateQuantizedTensor(output_data, output_dims, output_scale, output_zero_point); } else { @@ -373,6 +482,71 @@ TfLiteStatus TestFullyConnectedQuantized( golden_quantized, output_data); } +#ifdef USE_TFLM_COMPRESSION + +template +TfLiteStatus TestFullyConnectedQuantizedCompressed( + int* input_dims_data, const float* input_data, TIO* input_quantized, + float input_scale, int input_zero_point, int* output_dims_data, + const float* expected_output_data, TIO* expected_output_quantized, + TIO* output_quantized, float output_scale, int output_zero_point, + const TfLiteFusedActivation activation, + const TestCompressionQuantizedInfo* comp_info) { + // TODO(ddavis-2015): account for optional bias tensor + + bool null_bias = comp_info->bias_data == nullptr ? true : false; + + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(comp_info->filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(comp_info->bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + + TfLiteFloatArray* filter_scales = + FloatArrayFromFloats(comp_info->filter_scales); + TfLiteIntArray* filter_zero_points = + IntArrayFromInts(comp_info->filter_zero_points); + + TfLiteTensor filter_tensor = CreateQuantizedTensor( + comp_info->filter_compressed, filter_dims, filter_scales->data[0], + filter_zero_points->data[0], false, kTfLiteInt8); + SymmetricQuantize(comp_info->filter_data, comp_info->filter_value_table, + ElementCount(*filter_dims), filter_scales->data[0]); + + TfLiteTensor bias_tensor = CreateQuantizedTensor( + comp_info->bias_compressed, bias_dims, + input_scale * filter_scales->data[0], 0, false, typeToTfLiteType()); + SymmetricQuantize(comp_info->bias_data, comp_info->bias_value_table, + ElementCount(*bias_dims), bias_tensor.params.scale); + + for (int i = 0; i < ElementCount(*bias_dims); i++) { + int64_t bias_data0 = comp_info->bias_value_table[i]; + MicroPrintf( + "bias scale %f bias zero_point %d" + " bias data %f bias data quantized %lld", + (double)bias_tensor.params.scale, bias_tensor.params.zero_point, + (double)comp_info->bias_data[i], bias_data0); + } + + constexpr int tensors_size = kMaxTensors; + TfLiteTensor tensors[tensors_size] = { + CreateQuantizedTensor(input_data, input_quantized, input_dims, + input_scale, input_zero_point), + filter_tensor, + bias_tensor, + CreateQuantizedTensor(output_quantized, output_dims, output_scale, + output_zero_point), + }; + + const int output_dims_count = ElementCount(*output_dims); + Quantize(expected_output_data, expected_output_quantized, output_dims_count, + output_scale, output_zero_point); + return ValidateFullyConnectedGoldens( + tensors, tensors_size, null_bias, activation, 0.0f, output_dims_count, + expected_output_quantized, output_quantized, comp_info); +} + +#endif // USE_TFLM_COMPRESSION + } // namespace } // namespace testing } // namespace tflite @@ -393,6 +567,37 @@ TF_LITE_MICRO_TEST(SimpleTest) { kTfLiteOk); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestCompressed) { + float output_data[tflite::testing::simple_output_size]; + + 
tflite::testing::TestCompressionInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + comp_info.filter_value_table = tflite::testing::kBinQuantFilterValueTable; + comp_info.filter_value_table_stride = + std::extent::value; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidth; + comp_info.bias_value_table = tflite::testing::simple_bias_data; + comp_info.bias_value_table_stride = + std::extent::value; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidth; + + TF_LITE_MICRO_EXPECT_EQ( + tflite::testing::TestFullyConnectedFloat( + tflite::testing::simple_input_dims, + tflite::testing::simple_input_data, + tflite::testing::simple_weights_dims, + reinterpret_cast(tflite::testing::kBinQuantFilterData), + tflite::testing::simple_bias_dims, + reinterpret_cast(tflite::testing::kBinQuantBiasData), + tflite::testing::simple_golden, tflite::testing::simple_output_dims, + kTfLiteActNone, output_data, &comp_info), + kTfLiteOk); +} + +#endif // USE_TFLM_COMPRESSION + TF_LITE_MICRO_TEST(SimpleTestNullBias) { float output_data[tflite::testing::simple_output_size]; TF_LITE_MICRO_EXPECT_EQ( @@ -434,6 +639,54 @@ TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8) { kTfLiteOk); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8Compressed) { + const float input_scale = 1.0f; + const int input_zero_point = -1; + constexpr float weights_scale[] = {1, 1.0f}; + constexpr int weights_zero_point[] = {1, 0}; + const float output_scale = 0.5f; + const int output_zero_point = -1; + + int8_t input_quantized[tflite::testing::simple_input_size]; + int8_t weights_quantized[tflite::testing::simple_weights_size]; + int32_t bias_quantized[tflite::testing::simple_output_size]; + int8_t golden_quantized[tflite::testing::simple_output_size]; + int8_t output_data[tflite::testing::simple_output_size]; + + tflite::testing::TestCompressionQuantizedInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + comp_info.filter_value_table = weights_quantized; + comp_info.filter_value_table_stride = + std::extent::value; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidth; + comp_info.filter_compressed = tflite::testing::kBinQuantFilterData; + comp_info.filter_data = tflite::testing::kBinQuantFilterValueTable; + comp_info.filter_dims_data = tflite::testing::simple_weights_dims; + comp_info.filter_scales = weights_scale; + comp_info.filter_zero_points = weights_zero_point; + comp_info.bias_value_table = bias_quantized; + comp_info.bias_value_table_stride = + std::extent::value; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidth; + comp_info.bias_compressed = tflite::testing::kBinQuantBiasData; + comp_info.bias_data = tflite::testing::simple_bias_data; + comp_info.bias_dims_data = tflite::testing::simple_bias_dims; + // bias_scales and bias_zero_points are not used + + TF_LITE_MICRO_EXPECT_EQ( + tflite::testing::TestFullyConnectedQuantizedCompressed( + tflite::testing::simple_input_dims, + tflite::testing::simple_input_data, input_quantized, input_scale, + input_zero_point, tflite::testing::simple_output_dims, + tflite::testing::simple_golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone, &comp_info), + kTfLiteOk); +} + +#endif // USE_TFLM_COMPRESSION + #if !defined(HEXAGON) TF_LITE_MICRO_TEST(SimpleTestQuantizedInt16) { const float input_scale = 128.0 / 65536; @@ -443,7 +696,6 @@ TF_LITE_MICRO_TEST(SimpleTestQuantizedInt16) { const float output_scale = 128.0 / 65536; 
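Unlike the per-channel conv tests, the compressed fully connected tests use whole-tensor metadata: ValidateFullyConnectedGoldens sets is_per_channel_quantized and use_alternate_axis to false, so filter_value_table_stride is the full table length (the 10 entries of kBinQuantFilterValueTable) and the bias stride is the full bias length, with no division by a channel count. That is also why a single length-one scale array such as weights_scale[] = {1, 1.0f} is sufficient for the compressed int8 and int16 variants.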
const int output_zero_point = 0; - const float simple_golden[] = {24, 25, 26, 58, 59, 60}; int16_t input_quantized[tflite::testing::simple_input_size]; int8_t weights_quantized[tflite::testing::simple_weights_size]; int64_t bias_quantized[tflite::testing::simple_output_size]; @@ -457,12 +709,62 @@ TF_LITE_MICRO_TEST(SimpleTestQuantizedInt16) { input_zero_point, tflite::testing::simple_weights_dims, tflite::testing::simple_weights_data, weights_quantized, weights_scale, weights_zero_point, tflite::testing::simple_bias_dims, - tflite::testing::simple_bias_data, bias_quantized, simple_golden, - golden_quantized, tflite::testing::simple_output_dims, output_scale, - output_zero_point, kTfLiteActNone, output_data), + tflite::testing::simple_bias_data, bias_quantized, + tflite::testing::simple_golden, golden_quantized, + tflite::testing::simple_output_dims, output_scale, output_zero_point, + kTfLiteActNone, output_data), kTfLiteOk); } -#endif + +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestQuantizedInt16Compressed) { + const float input_scale = 128.0 / 65536; + const int input_zero_point = 0; + constexpr float weights_scale[] = {1, 1.0f}; + constexpr int weights_zero_point[] = {1, 0}; + const float output_scale = 128.0 / 65536; + const int output_zero_point = 0; + + int16_t input_quantized[tflite::testing::simple_input_size]; + int8_t weights_quantized[tflite::testing::simple_weights_size]; + int64_t bias_quantized[tflite::testing::simple_output_size]; + int16_t golden_quantized[tflite::testing::simple_output_size]; + int16_t output_data[tflite::testing::simple_output_size]; + + tflite::testing::TestCompressionQuantizedInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + comp_info.filter_value_table = weights_quantized; + comp_info.filter_value_table_stride = + std::extent::value; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidth; + comp_info.filter_compressed = tflite::testing::kBinQuantFilterData; + comp_info.filter_data = tflite::testing::kBinQuantFilterValueTable; + comp_info.filter_dims_data = tflite::testing::simple_weights_dims; + comp_info.filter_scales = weights_scale; + comp_info.filter_zero_points = weights_zero_point; + comp_info.bias_value_table = bias_quantized; + comp_info.bias_value_table_stride = + std::extent::value; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidth; + comp_info.bias_compressed = tflite::testing::kBinQuantBiasData; + comp_info.bias_data = tflite::testing::simple_bias_data; + comp_info.bias_dims_data = tflite::testing::simple_bias_dims; + // bias_scales and bias_zero_points are not used + + TF_LITE_MICRO_EXPECT_EQ( + tflite::testing::TestFullyConnectedQuantizedCompressed( + tflite::testing::simple_input_dims, + tflite::testing::simple_input_data, input_quantized, input_scale, + input_zero_point, tflite::testing::simple_output_dims, + tflite::testing::simple_golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone, &comp_info), + kTfLiteOk); +} + +#endif // USE_TFLM_COMPRESSION + +#endif // !defined(HEXAGON) TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedInt8) { const float input_scale = 1.0f; diff --git a/tensorflow/lite/micro/kernels/kernel_runner.cc b/tensorflow/lite/micro/kernels/kernel_runner.cc index 602778d7c50..da797d03aa3 100644 --- a/tensorflow/lite/micro/kernels/kernel_runner.cc +++ b/tensorflow/lite/micro/kernels/kernel_runner.cc @@ -1,4 +1,4 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -38,12 +38,22 @@ KernelRunner::KernelRunner(const TFLMRegistration& registration, TfLiteTensor* tensors, int tensors_size, TfLiteIntArray* inputs, TfLiteIntArray* outputs, const void* builtin_data, - TfLiteIntArray* intermediates) + TfLiteIntArray* intermediates +#ifdef USE_TFLM_COMPRESSION + , + const CompressedTensorList* compressed_tensors +#endif // USE_TFLM_COMPRESSION + ) : registration_(registration), allocator_(SingleArenaBufferAllocator::Create(kKernelRunnerBuffer_, kKernelRunnerBufferSize_)), mock_micro_graph_(allocator_), - fake_micro_context_(tensors, allocator_, &mock_micro_graph_) { + fake_micro_context_(tensors, allocator_, &mock_micro_graph_ +#ifdef USE_TFLM_COMPRESSION + , + compressed_tensors +#endif // USE_TFLM_COMPRESSION + ) { // Prepare TfLiteContext: context_.impl_ = static_cast(&fake_micro_context_); context_.ReportError = MicroContextReportOpError; diff --git a/tensorflow/lite/micro/kernels/kernel_runner.h b/tensorflow/lite/micro/kernels/kernel_runner.h index 25b97c11302..8dbd7f8b015 100644 --- a/tensorflow/lite/micro/kernels/kernel_runner.h +++ b/tensorflow/lite/micro/kernels/kernel_runner.h @@ -1,4 +1,4 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -36,7 +36,12 @@ class KernelRunner { KernelRunner(const TFLMRegistration& registration, TfLiteTensor* tensors, int tensors_size, TfLiteIntArray* inputs, TfLiteIntArray* outputs, const void* builtin_data, - TfLiteIntArray* intermediates = nullptr); + TfLiteIntArray* intermediates = nullptr +#ifdef USE_TFLM_COMPRESSION + , + const CompressedTensorList* compressed_tensors = nullptr +#endif // USE_TFLM_COMPRESSION + ); // Calls init and prepare on the kernel (i.e. TFLMRegistration) struct. // Any exceptions will be DebugLog'd and returned as a status code. diff --git a/tensorflow/lite/micro/kernels/kernel_util.h b/tensorflow/lite/micro/kernels/kernel_util.h index f14c927133d..977ed9563e1 100644 --- a/tensorflow/lite/micro/kernels/kernel_util.h +++ b/tensorflow/lite/micro/kernels/kernel_util.h @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -91,6 +91,31 @@ const T* GetOptionalTensorData(const TfLiteEvalTensor* tensor) { : reinterpret_cast(tensor->data.raw); } +#ifdef USE_TFLM_COMPRESSION + +// Overloads existing GetTensorData. If not compressed, this will return +// tensor->data. 
+// +// TODO(ddavis-2015): make micro_context a const pointer +template +const T* GetTensorData(MicroContext* micro_context, + const TfLiteEvalTensor* tensor, + const CompressionTensorData* compression_data, + int scratch_buffer_handle) { + if (tensor == nullptr) { + return nullptr; + } + if (compression_data == nullptr) { + return reinterpret_cast(tensor->data.data); + } + + void* uncompressed_data = micro_context->DecompressTensorToScratchBuffer( + *tensor, *compression_data, scratch_buffer_handle); + return reinterpret_cast(uncompressed_data); +} + +#endif // USE_TFLM_COMPRESSION + // Returns the shape of a TfLiteEvalTensor struct. const RuntimeShape GetTensorShape(const TfLiteEvalTensor* tensor); diff --git a/tensorflow/lite/micro/kernels/transpose_conv.cc b/tensorflow/lite/micro/kernels/transpose_conv.cc index ea0efae0607..7932f290f81 100644 --- a/tensorflow/lite/micro/kernels/transpose_conv.cc +++ b/tensorflow/lite/micro/kernels/transpose_conv.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -51,6 +51,14 @@ struct OpData { // A scratch buffer is required for quantized implementations. int scratch_buffer_index; +#ifdef USE_TFLM_COMPRESSION + + // scratch buffers for compressed tensors + int filter_scratch_index; + int bias_scratch_index; + +#endif // USE_TFLM_COMPRESSION + // Index to the converted 64-bit bias buffer from 16-bit bias. This is // required to handle 16x8 transpose convolutions where a 16-bit bias is // provided, whereas the kernel expects 64-bit biases. @@ -244,6 +252,17 @@ TfLiteStatus TransposeConvPrepare(TfLiteContext* context, TfLiteNode* node) { data->params.stride_width = params->stride_width; data->params.stride_height = params->stride_height; +#ifdef USE_TFLM_COMPRESSION + + // Compression scratch buffers. + // These will only be allocated if the tensor is compressed. 
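Taken together, the GetTensorData overload above and the per-kernel scratch indices define the pattern a kernel follows to read possibly-compressed tensors: reserve a decompression scratch buffer per compressible input in Prepare, then in Eval look up the tensor's CompressionTensorData and let GetTensorData either return the raw data or decompress into the scratch buffer. The condensed sketch below restates that pattern; ExampleOpData, kWeightsTensor and the two functions are illustrative names rather than code from this patch (see conv.cc, fully_connected.cc and transpose_conv.cc for the real versions).

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/compression.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/micro_context.h"

#ifdef USE_TFLM_COMPRESSION

struct ExampleOpData {
  int weights_scratch_index;  // scratch buffer for the decompressed weights
};

constexpr int kWeightsTensor = 1;  // illustrative input index

TfLiteStatus ExamplePrepare(TfLiteContext* context, TfLiteNode* node) {
  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
  auto* data = static_cast<ExampleOpData*>(node->user_data);
  // Only results in an allocation if the tensor is actually compressed.
  data->weights_scratch_index =
      micro_context->AllocateDecompressionScratchBuffer(node, kWeightsTensor);
  return kTfLiteOk;
}

TfLiteStatus ExampleEval(TfLiteContext* context, TfLiteNode* node) {
  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
  const auto* data = static_cast<const ExampleOpData*>(node->user_data);
  const TfLiteEvalTensor* weights =
      tflite::micro::GetEvalInput(context, node, kWeightsTensor);
  const tflite::CompressionTensorData* weights_comp_td =
      micro_context->GetTensorCompressionData(node, kWeightsTensor);
  // Decompresses into the scratch buffer when weights_comp_td != nullptr,
  // otherwise returns weights->data unchanged.
  const int8_t* weights_data = tflite::micro::GetTensorData<int8_t>(
      micro_context, weights, weights_comp_td, data->weights_scratch_index);
  (void)weights_data;
  return kTfLiteOk;
}

#endif  // USE_TFLM_COMPRESSION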
+ data->filter_scratch_index = + micro_context->AllocateDecompressionScratchBuffer(node, kFilterTensor); + data->bias_scratch_index = + micro_context->AllocateDecompressionScratchBuffer(node, kBiasTensor); + +#endif // USE_TFLM_COMPRESSION + micro_context->DeallocateTempTfLiteTensor(output); micro_context->DeallocateTempTfLiteTensor(input); micro_context->DeallocateTempTfLiteTensor(filter); @@ -262,6 +281,18 @@ TfLiteStatus TransposeConvEval(TfLiteContext* context, TfLiteNode* node) { TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, kOutputTensor); +#ifdef USE_TFLM_COMPRESSION + + // TODO(ddavis-2015): make micro_context a const pointer + MicroContext* micro_context = GetMicroContext(context); + + const CompressionTensorData* filter_comp_td = + micro_context->GetTensorCompressionData(node, kFilterTensor); + const CompressionTensorData* bias_comp_td = + micro_context->GetTensorCompressionData(node, kBiasTensor); + +#endif // USE_TFLM_COMPRESSION + TFLITE_DCHECK(node->user_data != nullptr); const OpData& data = *(static_cast(node->user_data)); @@ -280,9 +311,17 @@ TfLiteStatus TransposeConvEval(TfLiteContext* context, TfLiteNode* node) { op_params, tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, filter, filter_comp_td, data.filter_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(micro_context, bias, bias_comp_td, + data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output), tflite::micro::GetTensorShape(nullptr), nullptr); @@ -296,9 +335,17 @@ TfLiteStatus TransposeConvEval(TfLiteContext* context, TfLiteNode* node) { data.per_channel_output_shift, tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, filter, filter_comp_td, data.filter_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output), tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer); @@ -311,16 +358,29 @@ TfLiteStatus TransposeConvEval(TfLiteContext* context, TfLiteNode* node) { auto* bias_converted_buffer = static_cast(context->GetScratchBuffer( context, data.bias_converted_buffer_index)); + const int16_t* const bias_int16_data = +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index); +#else // USE_TFLM_COMPRESSION + static_cast(bias->data.data); +#endif // USE_TFLM_COMPRESSION for (int i = 0; i < tflite::micro::GetTensorShape(bias).FlatSize(); i++) { - bias_converted_buffer[i] = bias->data.i16[i]; + bias_converted_buffer[i] = bias_int16_data[i]; } reference_integer_ops::TransposeConv( data.params, data.per_channel_output_multiplier, data.per_channel_output_shift, tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), 
tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + filter_comp_td, + data.filter_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(bias), bias_converted_buffer, tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output), @@ -331,9 +391,18 @@ TfLiteStatus TransposeConvEval(TfLiteContext* context, TfLiteNode* node) { data.per_channel_output_shift, tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + filter_comp_td, + data.filter_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output), tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer); diff --git a/tensorflow/lite/micro/kernels/transpose_conv_test.cc b/tensorflow/lite/micro/kernels/transpose_conv_test.cc index 49d2c90f439..64dded4dba0 100644 --- a/tensorflow/lite/micro/kernels/transpose_conv_test.cc +++ b/tensorflow/lite/micro/kernels/transpose_conv_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/micro/kernels/conv_test.h" #include "tensorflow/lite/micro/kernels/kernel_runner.h" #include "tensorflow/lite/micro/micro_utils.h" #include "tensorflow/lite/micro/test_helpers.h" @@ -25,28 +27,88 @@ namespace tflite { namespace testing { namespace { +constexpr float kTolerance = 1e-5; + // Common inputs and outputs. 
constexpr int kInputElements = 32; static int kInputShape[] = {4, 1, 4, 4, 2}; -static const float kInputData[kInputElements] = { +static constexpr float kInputData[kInputElements] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}; constexpr int kFilterElements = 18; static int kFilterShape[] = {4, 1, 3, 3, 2}; -static const float kFilterData[kFilterElements] = { +static constexpr float kFilterData[kFilterElements] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; constexpr int kBiasElements = 1; static int kBiasShape[] = {4, 1, 1, 1, 1}; -static const float kBiasData[kBiasElements] = {0}; +static constexpr float kBiasData[kBiasElements] = {0}; constexpr int kOutputElements = 16; static int kOutputShape[] = {4, 1, 4, 4, 1}; -static const float kGoldenData[kOutputElements] = { +static constexpr float kGoldenData[kOutputElements] = { 184, 412, 568, 528, 678, 1347, 1689, 1434, 1494, 2715, 3057, 2442, 1968, 3352, 3652, 2760}; +// Common inputs and outputs (quantized single channel). +// data from TfLite test: SimpleBiasTestQuantizedPerChannelSingleChannel +constexpr int kInputElementsQ1 = 16; +static int kInputShapeQ1[] = {4, 1, 4, 4, 1}; +static constexpr float kInputDataQ1[kInputElementsQ1] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + +constexpr int kFilterElementsQ1 = 9; +static int kFilterShapeQ1[] = {4, 1, 3, 3, 1}; +static constexpr float kFilterDataQ1[kFilterElementsQ1] = {1, 2, 3, 4, 5, + 6, 7, 8, 9}; + +constexpr int kBiasElementsQ1 = 1; +static int kBiasShapeQ1[] = {1, 1}; +static constexpr float kBiasDataQ1[kBiasElementsQ1] = {1}; + +constexpr int kOutputElementsQ1 = 16; +static int kOutputShapeQ1[] = {4, 1, 4, 4, 1}; +static constexpr float kGoldenDataQ1[kOutputElementsQ1] = { + 30, 62, 84, 76, 100, 192, 238, 198, 206, 372, 416, 330, 262, 446, 484, 366}; + +// Common inputs and outputs (quantized multi channel). +// data from TfLite test: SimpleBiasTestQuantizedPerChannel16x8Bias64 +constexpr int kInputElementsQ2 = 12; +static int kInputShapeQ2[] = {4, 1, 2, 3, 2}; +static constexpr float kInputDataQ2[kInputElementsQ2] = { + // [1 * 2 * 3 * 2] as [batch, y, x, input_channel] + 3, 2, // batch = 0, y = 0, x = 0 + 1, -1, // batch = 0, y = 0, x = 1 + -2, -3, // batch = 0, y = 0, x = 2 + 4, 3, // batch = 0, y = 1, x = 0 + 2, -2, // batch = 0, y = 1, x = 1 + -3, -4, // batch = 0, y = 1, x = 2 +}; + +constexpr int kFilterElementsQ2 = 16; +static int kFilterShapeQ2[] = {4, 2, 2, 2, 2}; +static constexpr float kFilterDataQ2[kFilterElementsQ2] = { + // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel] + 1, 2, // out channel = 0, y = 0, x = 0 + 3, 4, // out channel = 0, y = 0, x = 1 + 3, 4, // out channel = 0, y = 1, x = 0 + 5, 6, // out channel = 0, y = 1, x = 1 + 7, 8, // out channel = 1, y = 0, x = 0 + 5, 6, // out channel = 1, y = 0, x = 1 + 3, 4, // out channel = 1, y = 1, x = 0 + 1, 2, // out channel = 1, y = 1, x = 1 +}; + +constexpr int kBiasElementsQ2 = 2; +static int kBiasShapeQ2[] = {1, 2}; +static constexpr float kBiasDataQ2[kBiasElementsQ2] = {3, -2}; + +constexpr int kOutputElementsQ2 = 12; +static int kOutputShapeQ2[] = {4, 1, 2, 3, 2}; +static constexpr float kGoldenDataQ2[kOutputElementsQ2] = { + 10, 35, 19, 24, -6, -41, 30, 64, 51, 40, -29, -64}; + // Transpose conv uses TfLiteConvParams. 
static TfLiteConvParams common_conv_params = {kTfLitePaddingSame, // padding 1, // stride_width @@ -56,19 +118,114 @@ static TfLiteConvParams common_conv_params = {kTfLitePaddingSame, // padding 1, kTfLiteNoType}; -template -TfLiteStatus InvokeTransposeConv(TfLiteTensor* tensors, int tensors_size, - int output_length, - TfLiteConvParams* conv_params, - T* output_data) { +// Compression inputs and associated data +constexpr int kMaxTensors = 5; +constexpr int kOutputTensor = 4; // physical index + +#ifdef USE_TFLM_COMPRESSION + +constexpr int kFilterTensor = 1; // physical index +constexpr int kBiasTensor = 3; // physical index + +template +struct TestCompressionInfo { + TFILTER* filter_value_table; + size_t filter_value_table_stride; + int filter_bit_width; + bool use_filter_alt_axis; + TBIAS* bias_value_table; + size_t bias_value_table_stride; + int bias_bit_width; + bool use_bias_alt_axis; + CompressionScheme scheme; +}; + +template +struct TestCompressionQuantizedInfo : TestCompressionInfo { + const uint8_t* filter_compressed; + const float* filter_data; + const int* filter_dims_data; // TfLiteIntArray + const float* filter_scales; // TfLiteFloatArray + const int* filter_zero_points; // TfLiteIntArray + + const uint8_t* bias_compressed; + const float* bias_data; + const int* bias_dims_data; // TfLiteIntArray + float* bias_scales; // TfLiteFloatArray (computed) + int* bias_zero_points; // TfLiteIntArray (computed) +}; + +#endif // USE_TFLM_COMPRESSION + +template +TfLiteStatus InvokeTransposeConv( + TfLiteTensor* tensors, int tensors_size, const TfLiteConvParams* conv_params +#ifdef USE_TFLM_COMPRESSION + , + const TestCompressionInfo* comp_info = nullptr +#endif // USE_TFLM_COMPRESSION +) { int inputs_array_data[] = {4, 0, 1, 2, 3}; TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); int outputs_array_data[] = {1, 4}; TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + // TODO(ddavis-2015): account for optional bias tensor + +#ifdef USE_TFLM_COMPRESSION + + CompressionTensorData* compressed_tensors[kMaxTensors] = {}; + CompressionTensorData filter_comp_data = {}; + CompressionTensorData bias_comp_data = {}; + CompressedTensorList comp_list = {compressed_tensors}; + CompressedTensorList* comp_list_p = nullptr; + + if (comp_info != nullptr) { + if (comp_info->scheme == CompressionScheme::kBinQuant) { + bool is_per_channel_quantized = + std::is_same::value ? 
false : true; + if (comp_info->filter_value_table != nullptr) { + compressed_tensors[kFilterTensor] = &filter_comp_data; + filter_comp_data.scheme = CompressionScheme::kBinQuant; + filter_comp_data.data.bin_quant.compressed_bit_width = + comp_info->filter_bit_width; + filter_comp_data.data.bin_quant.value_table = + comp_info->filter_value_table; + filter_comp_data.data.bin_quant.value_table_channel_stride = + comp_info->filter_value_table_stride; + filter_comp_data.data.bin_quant.is_per_channel_quantized = + is_per_channel_quantized; + filter_comp_data.data.bin_quant.use_alternate_axis = + comp_info->use_filter_alt_axis; + } + if (comp_info->bias_value_table != nullptr) { + compressed_tensors[kBiasTensor] = &bias_comp_data; + bias_comp_data.scheme = CompressionScheme::kBinQuant; + bias_comp_data.data.bin_quant.compressed_bit_width = + comp_info->bias_bit_width; + bias_comp_data.data.bin_quant.value_table = comp_info->bias_value_table; + bias_comp_data.data.bin_quant.value_table_channel_stride = + comp_info->bias_value_table_stride; + bias_comp_data.data.bin_quant.is_per_channel_quantized = + is_per_channel_quantized; + bias_comp_data.data.bin_quant.use_alternate_axis = + comp_info->use_bias_alt_axis; + } + comp_list_p = &comp_list; + } else { + return kTfLiteError; + } + } + +#endif // USE_TFLM_COMPRESSION const TFLMRegistration registration = tflite::Register_TRANSPOSE_CONV(); micro::KernelRunner runner(registration, tensors, tensors_size, inputs_array, - outputs_array, conv_params); + outputs_array, conv_params, nullptr +#ifdef USE_TFLM_COMPRESSION + , + comp_list_p +#endif // USE_TFLM_COMPRESSION + ); const char* init_data = reinterpret_cast(conv_params); TfLiteStatus status = runner.InitAndPrepare(init_data); @@ -78,43 +235,65 @@ TfLiteStatus InvokeTransposeConv(TfLiteTensor* tensors, int tensors_size, return runner.Invoke(); } -template -TfLiteStatus ValidateTransposeConvGoldens(TfLiteTensor* tensors, - int tensors_size, - const T* expected_output_data, - int output_length, - TfLiteConvParams* conv_params, - T* output_data, float tolerance) { - TfLiteStatus status = InvokeTransposeConv( - tensors, tensors_size, output_length, conv_params, output_data); +template +TfLiteStatus ValidateTransposeConvGoldens( + TfLiteTensor* tensors, int tensors_size, const float* expected_output_data, + int output_length, float* output_data, T* output_quantized, + TfLiteConvParams* conv_params, float tolerance +#ifdef USE_TFLM_COMPRESSION + , + const TestCompressionInfo* comp_info = nullptr +#endif // USE_TFLM_COMPRESSION +) { + TfLiteStatus status = InvokeTransposeConv(tensors, tensors_size, conv_params +#ifdef USE_TFLM_COMPRESSION + , + comp_info +#endif // USE_TFLM_COMPRESSION + ); if (status != kTfLiteOk) { return status; } + + if (output_quantized != nullptr) { + // TODO(ddavis-2015): account for optional bias tensor + const float scale = tensors[kOutputTensor].params.scale; + const int zero_point = tensors[kOutputTensor].params.zero_point; + Dequantize(output_quantized, output_length, scale, zero_point, output_data); + MicroPrintf("Dequantize: scale %f zero_point %d length %d", (double)scale, + zero_point, output_length); + } for (int i = 0; i < output_length; ++i) { TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], tolerance); } + return kTfLiteOk; } +template TfLiteStatus TestTransposeConvFloat( int* input_dims_data, const float* input_data, int* filter_dims_data, const float* filter_data, int* bias_dims_data, const float* bias_data, int* output_dims_data, const float* 
expected_output_data, - TfLiteConvParams* conv_params, float* output_data) { + TfLiteConvParams* conv_params, float* output_data +#ifdef USE_TFLM_COMPRESSION + , + const TestCompressionInfo* comp_info = nullptr +#endif // USE_TFLM_COMPRESSION +) { + // TODO(ddavis-2015): account for optional bias tensor + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); int output_shape_dims_data[] = {1, 0}; int32_t* output_shape = nullptr; TfLiteIntArray* output_shape_dims = IntArrayFromInts(output_shape_dims_data); - constexpr int inputs_size = 4; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; + constexpr int tensors_size = kMaxTensors; TfLiteTensor tensors[tensors_size] = { CreateTensor(output_shape, output_shape_dims), CreateTensor(filter_data, filter_dims), @@ -123,110 +302,205 @@ TfLiteStatus TestTransposeConvFloat( CreateTensor(output_data, output_dims), }; - return ValidateTransposeConvGoldens(tensors, tensors_size, - expected_output_data, output_dims_count, - conv_params, output_data, 0.001f); + const int output_dims_count = ElementCount(*output_dims); + return ValidateTransposeConvGoldens( + tensors, tensors_size, expected_output_data, output_dims_count, + output_data, nullptr, conv_params, kTolerance +#ifdef USE_TFLM_COMPRESSION + , + comp_info +#endif // USE_TFLM_COMPRESSION + ); } +template TfLiteStatus TestTransposeConvQuantized( - int* input_dims_data, const float* input_data, int8_t* input_quantized, + int* input_dims_data, const float* input_data, TIO* input_quantized, float input_scale, int input_zero_point, int* filter_dims_data, - const float* filter_data, int8_t* filter_quantized, float filter_scale, - int* bias_dims_data, const float* bias_data, int32_t* bias_quantized, - float* bias_scales, int* bias_zero_points, int* output_dims_data, - const float* expected_output_data, int8_t* expected_output_quantized, - float output_scale, int output_zero_point, TfLiteConvParams* conv_params, - int8_t* output_data) { + const float* filter_data, int8_t* filter_quantized, int* bias_dims_data, + const float* bias_data, TBIAS* bias_quantized, int* output_dims_data, + const float* expected_output_data, float* output_data, + TIO* output_quantized, float output_scale, int output_zero_point, + TfLiteConvParams* conv_params) { + // TODO(ddavis-2015): account for optional bias tensor + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); int filter_zero_points[5]; - float filter_scales[5]; + float filter_scales[std::extent::value]; TfLiteAffineQuantization filter_quant; + TF_LITE_MICRO_EXPECT_LE(static_cast(filter_dims->data[0]), + std::extent::value - 1); + TF_LITE_MICRO_CHECK_FAIL(); TfLiteTensor filter_tensor = CreateSymmetricPerChannelQuantizedTensor( filter_data, filter_quantized, filter_dims, filter_scales, filter_zero_points, &filter_quant, 0 /* quantized dimension */); - tflite::Quantize(expected_output_data, expected_output_quantized, - output_dims_count, output_scale, 0); + MicroPrintf( + "input scale %f filter scale %f filter 
zero_point %d filter size %d %d" + " filter qp %p %p filter data %f filter data quantized %d", + (double)input_scale, (double)filter_quant.scale->data[0], + filter_quant.zero_point->data[0], filter_quant.scale->size, + filter_quant.zero_point->size, &filter_quant, + filter_tensor.quantization.params, (double)filter_data[0], + filter_quantized[0]); + + int bias_zero_points[std::extent::value]; + float bias_scales[std::extent::value]; + TfLiteAffineQuantization bias_quant; + TfLiteTensor bias_tensor = {}; + // TODO(ddavis-2015): cleanup + if (filter_quant.scale->size > 0) { + bias_tensor = CreatePerChannelQuantizedBiasTensor( + bias_data, bias_quantized, bias_dims, input_scale, + filter_quant.scale->data, bias_scales, bias_zero_points, &bias_quant, + 0 /* quantized dimension */); + int64_t bias_data0 = bias_quantized[0]; + MicroPrintf( + "bias scale %f bias zero_point %d bias size %d %d bias qp %p %p" + " bias data %f bias data quantized %lld", + (double)bias_quant.scale->data[0], bias_quant.zero_point->data[0], + bias_quant.scale->size, bias_quant.zero_point->size, &bias_quant, + bias_tensor.quantization.params, (double)bias_data[0], bias_data0); + } else { + bias_tensor = + CreateQuantizedBiasTensor(bias_data, bias_quantized, bias_dims, + input_scale, filter_quant.scale->data[0]); + + int64_t bias_data0 = bias_quantized[0]; + MicroPrintf( + "bias scale %f bias zero_point %d bias qp %p bias data %f bias data " + "quantized %lld", + (double)bias_tensor.params.scale, bias_tensor.params.zero_point, + bias_tensor.quantization.params, (double)bias_data[0], bias_data0); + } int output_shape_dims_data[] = {1, 0}; int32_t* output_shape = nullptr; TfLiteIntArray* output_shape_dims = IntArrayFromInts(output_shape_dims_data); - constexpr int inputs_size = 4; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; + constexpr int tensors_size = kMaxTensors; TfLiteTensor tensors[tensors_size] = { - CreateTensor(output_shape, output_shape_dims), filter_tensor, + CreateTensor(output_shape, output_shape_dims), + filter_tensor, CreateQuantizedTensor(input_data, input_quantized, input_dims, input_scale, input_zero_point), - CreateQuantizedBiasTensor(bias_data, bias_quantized, bias_dims, - input_scale, filter_scale), - CreateQuantizedTensor(output_data, output_dims, output_scale, - output_zero_point)}; + bias_tensor, + CreateQuantizedTensor(output_quantized, output_dims, output_scale, + output_zero_point), + }; + // TODO(ddavis-2015): investigate why the tolerance differs from the TfLite + // tests which use 1e-5 + // + // Tolerance is slightly looser for 8x16 compared with float, since quant + // error is more pronounced on the finer-grained 16-bit output. + constexpr float tolerance = std::is_same::value ? 
2.0f : 4.0f; + const int output_dims_count = ElementCount(*output_dims); return ValidateTransposeConvGoldens( - tensors, tensors_size, expected_output_quantized, output_dims_count, - conv_params, output_data, 1.0f); + tensors, tensors_size, expected_output_data, output_dims_count, + output_data, output_quantized, conv_params, tolerance); } -template -TfLiteStatus TestTransposeConvQuantized( - int* input_dims_data, const float* input_data, int16_t* input_quantized, - float input_scale, int input_zero_point, int* filter_dims_data, - const float* filter_data, int8_t* filter_quantized, float filter_scale, - int* bias_dims_data, const float* bias_data, T* bias_quantized, - float* bias_scales, int* bias_zero_points, int* output_dims_data, - const float* expected_output_data, int16_t* expected_output_quantized, - float output_scale, int output_zero_point, TfLiteConvParams* conv_params, - int16_t* output_data) { +#ifdef USE_TFLM_COMPRESSION + +template +TfLiteStatus TestTransposeConvQuantizedCompressed( + int* input_dims_data, const float* input_data, TIO* input_quantized, + float input_scale, int input_zero_point, int* output_dims_data, + const float* expected_output_data, float* output_data, + TIO* output_quantized, float output_scale, int output_zero_point, + TfLiteConvParams* conv_params, + const TestCompressionQuantizedInfo* comp_info) { + // TODO(ddavis-2015): account for optional bias tensor + MicroPrintf("%s", __PRETTY_FUNCTION__); + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); - TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); - TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(comp_info->filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(comp_info->bias_dims_data); TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - int filter_zero_points[5]; - float filter_scales[5]; - TfLiteAffineQuantization filter_quant; - TfLiteTensor filter_tensor = CreateSymmetricPerChannelQuantizedTensor( - filter_data, filter_quantized, filter_dims, filter_scales, - filter_zero_points, &filter_quant, 0 /* quantized dimension */); - tflite::Quantize(expected_output_data, expected_output_quantized, - output_dims_count, output_scale, 0); + TfLiteFloatArray* filter_scales = + FloatArrayFromFloats(comp_info->filter_scales); + TfLiteIntArray* filter_zero_points = + IntArrayFromInts(comp_info->filter_zero_points); + TfLiteFloatArray* bias_scales = FloatArrayFromFloats(comp_info->bias_scales); + TfLiteIntArray* bias_zero_points = + IntArrayFromInts(comp_info->bias_zero_points); + + size_t quantized_axis; + + TfLiteAffineQuantization filter_quant_params; + quantized_axis = comp_info->use_filter_alt_axis ? 3 : 0; + TfLiteTensor filter_tensor = CreatePerChannelQuantizedTensor( + comp_info->filter_compressed, filter_dims, filter_scales, + filter_zero_points, &filter_quant_params, quantized_axis, false, + kTfLiteInt8); + SymmetricPerChannelQuantize( + comp_info->filter_data, comp_info->filter_value_table, + ElementCount(*filter_dims), filter_dims->data[quantized_axis], + filter_scales->data); + + TfLiteAffineQuantization bias_quant_params; + quantized_axis = comp_info->use_bias_alt_axis ? 
3 : 0; + TfLiteTensor bias_tensor = CreatePerChannelQuantizedBiasTensor( + comp_info->bias_compressed, bias_dims, input_scale, filter_scales, + bias_scales, bias_zero_points, &bias_quant_params, quantized_axis, false, + typeToTfLiteType()); + SymmetricPerChannelQuantize(comp_info->bias_data, comp_info->bias_value_table, + ElementCount(*bias_dims), + bias_dims->data[quantized_axis], + bias_scales->data); + for (int i = 0; i < bias_scales->size; i++) { + int64_t bias_data0 = comp_info->bias_value_table[i]; + MicroPrintf( + "bias scale %f bias zero_point %d bias size %d %d bias qp %p %p" + " bias data %f bias data quantized %lld", + (double)bias_quant_params.scale->data[i], + bias_quant_params.zero_point->data[i], bias_quant_params.scale->size, + bias_quant_params.zero_point->size, &bias_quant_params, + bias_tensor.quantization.params, (double)comp_info->bias_data[i], + bias_data0); + } int output_shape_dims_data[] = {1, 0}; int32_t* output_shape = nullptr; TfLiteIntArray* output_shape_dims = IntArrayFromInts(output_shape_dims_data); - constexpr int inputs_size = 4; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; + constexpr int tensors_size = kMaxTensors; TfLiteTensor tensors[tensors_size] = { - CreateTensor(output_shape, output_shape_dims), filter_tensor, + CreateTensor(output_shape, output_shape_dims), + filter_tensor, CreateQuantizedTensor(input_data, input_quantized, input_dims, input_scale, input_zero_point), - CreateQuantizedBiasTensor(bias_data, bias_quantized, bias_dims, - input_scale, filter_scale), - CreateQuantizedTensor(output_data, output_dims, output_scale, - output_zero_point)}; + bias_tensor, + CreateQuantizedTensor(output_quantized, output_dims, output_scale, + output_zero_point), + }; + // TODO(ddavis-2015): why is int8 tolerance so large? + // // Tolerance is slightly looser for 8x16 compared with float, since quant // error is more pronounced on the finer-grained 16-bit output. + constexpr float tolerance = std::is_same::value ? 
2.0f : 0.19f; + const int output_dims_count = ElementCount(*output_dims); return ValidateTransposeConvGoldens( - tensors, tensors_size, expected_output_quantized, output_dims_count, - conv_params, output_data, 4.0f); + tensors, tensors_size, expected_output_data, output_dims_count, + output_data, output_quantized, conv_params, tolerance, comp_info); } +#endif // USE_TFLM_COMPRESSION + } // namespace } // namespace testing } // namespace tflite TF_LITE_MICRO_TESTS_BEGIN +// TODO(ddavis-2015): add tests with no bias tensor + TF_LITE_MICRO_TEST(SimpleTestFloat) { float output_data[tflite::testing::kOutputElements]; @@ -240,6 +514,44 @@ TF_LITE_MICRO_TEST(SimpleTestFloat) { &tflite::testing::common_conv_params, output_data)); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestFloatCompressed) { + float output_data[tflite::testing::kOutputElements]; + + // compressed filter data for kBinQuant scheme + constexpr uint8_t kBinQuantFilterData[] = { + 0x00, 0x44, 0x32, 0x14, 0xC7, 0x42, 0x54, 0xB6, 0x35, 0xCF, 0x84, 0x40}; + constexpr int kBinQuantFilterBitWidth = 5; + // compressed bias data for kBinQuant scheme + constexpr uint8_t kBinQuantBiasData[] = {0x00}; + constexpr int kBinQuantBiasBitWidth = 1; + + tflite::testing::TestCompressionInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + comp_info.filter_value_table = tflite::testing::kFilterData; + comp_info.filter_value_table_stride = + std::extent::value; + comp_info.filter_bit_width = kBinQuantFilterBitWidth; + comp_info.bias_value_table = tflite::testing::kBiasData; + comp_info.bias_value_table_stride = + std::extent::value; + comp_info.bias_bit_width = kBinQuantBiasBitWidth; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestTransposeConvFloat( + tflite::testing::kInputShape, tflite::testing::kInputData, + tflite::testing::kFilterShape, + reinterpret_cast(kBinQuantFilterData), + tflite::testing::kBiasShape, + reinterpret_cast(kBinQuantBiasData), + tflite::testing::kOutputShape, tflite::testing::kGoldenData, + &tflite::testing::common_conv_params, output_data, &comp_info)); +} + +#endif // USE_TFLM_COMPRESSION + TF_LITE_MICRO_TEST(fusedRELUTest) { float output_data[tflite::testing::kOutputElements]; float golden_data[] = {29, 24, 0, 0, 99, 72, 0, 0, @@ -317,21 +629,27 @@ TF_LITE_MICRO_TEST(MultiChannelBiasWithFusedActivationTest) { bias_data, output_shape, golden_data, &conv_params, output_data)); } +#ifdef notdef +// TODO(ddavis-2015): remove TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannel) { - int8_t output_data[tflite::testing::kOutputElements]; - - const float input_scale = 0.5f; - const float output_scale = 30.0f; - const float filter_scale = 1.0f; - const int input_zero_point = 0; - const int output_zero_point = 0; - int8_t input_quantized[tflite::testing::kInputElements]; int8_t filter_quantized[tflite::testing::kFilterElements]; int32_t bias_quantized[tflite::testing::kBiasElements]; - int8_t golden_quantized[tflite::testing::kOutputElements]; - int zero_points[tflite::testing::kBiasElements + 1]; - float scales[tflite::testing::kBiasElements + 1]; + int8_t output_quantized[tflite::testing::kOutputElements]; + float output_data[tflite::testing::kOutputElements]; + + auto mm = std::minmax_element(std::begin(tflite::testing::kInputData), + std::end(tflite::testing::kInputData)); + const float input_scale = + tflite::testing::ScaleFromMinMax(*mm.first, *mm.second); + const int input_zero_point = + tflite::testing::ZeroPointFromMinMax(*mm.first, *mm.second); + mm = 
std::minmax_element(std::begin(tflite::testing::kGoldenData), + std::end(tflite::testing::kGoldenData)); + const float output_scale = + tflite::testing::ScaleFromMinMax(*mm.first, *mm.second); + const int output_zero_point = + tflite::testing::ZeroPointFromMinMax(*mm.first, *mm.second); TF_LITE_MICRO_EXPECT_EQ( kTfLiteOk, @@ -339,28 +657,303 @@ TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannel) { tflite::testing::kInputShape, tflite::testing::kInputData, input_quantized, input_scale, input_zero_point, tflite::testing::kFilterShape, tflite::testing::kFilterData, - filter_quantized, filter_scale, tflite::testing::kBiasShape, - tflite::testing::kBiasData, bias_quantized, scales, zero_points, + filter_quantized, tflite::testing::kBiasShape, + tflite::testing::kBiasData, bias_quantized, tflite::testing::kOutputShape, tflite::testing::kGoldenData, - golden_quantized, output_scale, output_zero_point, - &tflite::testing::common_conv_params, output_data)); + output_data, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params)); } +#endif + +TF_LITE_MICRO_TEST(SimpleBiasTestQuantizedPerChannelSingleChannel) { + // data from TfLite test: SimpleBiasTestQuantizedPerChannelSingleChannel + const float input_scale = 16.0f / 255.0f; + const float output_scale = 2.0f; + const int input_zero_point = -128; + const int output_zero_point = -128; + + int8_t input_quantized[tflite::testing::kInputElementsQ1]; + int8_t filter_quantized[tflite::testing::kFilterElementsQ1]; + int32_t bias_quantized[tflite::testing::kBiasElementsQ1]; + int8_t output_quantized[tflite::testing::kOutputElementsQ1]; + float output_data[tflite::testing::kOutputElementsQ1]; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestTransposeConvQuantized( + tflite::testing::kInputShapeQ1, tflite::testing::kInputDataQ1, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilterShapeQ1, tflite::testing::kFilterDataQ1, + filter_quantized, tflite::testing::kBiasShapeQ1, + tflite::testing::kBiasDataQ1, bias_quantized, + tflite::testing::kOutputShapeQ1, tflite::testing::kGoldenDataQ1, + output_data, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params)); +} + +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannelSingleChannelCompressed) { + // data from TfLite test: SimpleBiasTestQuantizedPerChannelSingleChannel + const float input_scale = 16.0f / 255.0f; + const float output_scale = 2.0f; + const int input_zero_point = -128; + const int output_zero_point = -128; + + constexpr float kFilterScales[] = {1, 9.0f / 127.0f}; + constexpr int kFilterZeroPoints[] = {1, 0}; + // all values will be computed + float kBiasScales[std::extent::value] = {}; + // all values will be computed + int kBiasZeroPoints[std::extent::value] = {}; + + int8_t input_quantized[tflite::testing::kInputElementsQ1]; + int8_t filter_quantized[tflite::testing::kFilterElementsQ1]; + int32_t bias_quantized[tflite::testing::kBiasElementsQ1]; + int8_t output_quantized[tflite::testing::kOutputElementsQ1]; + float output_data[tflite::testing::kOutputElementsQ1]; + + // compressed filter data for kBinQuant scheme + constexpr uint8_t kBinQuantFilterData[] = {0x01, 0x23, 0x45, 0x67, 0x80}; + constexpr int kBinQuantFilterBitWidth = 4; + // compressed bias data for kBinQuant scheme + constexpr uint8_t kBinQuantBiasData[] = {0x00}; + constexpr int kBinQuantBiasBitWidth = 1; + + tflite::testing::TestCompressionQuantizedInfo comp_info = {}; + comp_info.scheme = 
tflite::CompressionScheme::kBinQuant; + comp_info.filter_value_table = filter_quantized; + comp_info.filter_value_table_stride = + std::extent::value; + comp_info.filter_bit_width = kBinQuantFilterBitWidth; + comp_info.filter_compressed = kBinQuantFilterData; + comp_info.filter_data = tflite::testing::kFilterDataQ1; + comp_info.filter_dims_data = tflite::testing::kFilterShapeQ1; + comp_info.filter_scales = kFilterScales; + comp_info.filter_zero_points = kFilterZeroPoints; + comp_info.bias_value_table = bias_quantized; + comp_info.bias_value_table_stride = + std::extent::value; + comp_info.bias_bit_width = kBinQuantBiasBitWidth; + comp_info.bias_compressed = kBinQuantBiasData; + comp_info.bias_data = tflite::testing::kBiasDataQ1; + comp_info.bias_dims_data = tflite::testing::kBiasShapeQ1; + comp_info.bias_scales = kBiasScales; + comp_info.bias_zero_points = kBiasZeroPoints; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestTransposeConvQuantizedCompressed( + tflite::testing::kInputShapeQ1, tflite::testing::kInputDataQ1, + input_quantized, input_scale, input_zero_point, + tflite::testing::kOutputShapeQ1, tflite::testing::kGoldenDataQ1, + output_data, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params, &comp_info)); +} + +#endif // USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleBiasTestQuantizedPerChannelBias16MultiChannel) { + // data from TfLite test: SimpleBiasTestQuantizedPerChannel16x8Bias64 + const float input_scale = 4.0f / 127.0f; + const float output_scale = 128.0f / 65536.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int16_t input_quantized[tflite::testing::kInputElementsQ2]; + int8_t filter_quantized[tflite::testing::kFilterElementsQ2]; + int16_t bias_quantized[tflite::testing::kBiasElementsQ2]; + int16_t output_quantized[tflite::testing::kOutputElementsQ2]; + float output_data[tflite::testing::kOutputElementsQ2]; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestTransposeConvQuantized( + tflite::testing::kInputShapeQ2, tflite::testing::kInputDataQ2, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilterShapeQ2, tflite::testing::kFilterDataQ2, + filter_quantized, tflite::testing::kBiasShapeQ2, + tflite::testing::kBiasDataQ2, bias_quantized, + tflite::testing::kOutputShapeQ2, tflite::testing::kGoldenDataQ2, + output_data, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params)); +} + +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST( + SimpleBiasTestQuantizedPerChannelBias16MultiChannelCompressed) { + // data from TfLite test: SimpleBiasTestQuantizedPerChannel16x8Bias64 + const float input_scale = 4.0f / 127.0f; + const float output_scale = 128.0f / 65536.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; -TF_LITE_MICRO_TEST(SimpleTestQuantized16x8PerChannel) { - int16_t output_data[tflite::testing::kOutputElements]; + constexpr int kNumChannels = 2; + constexpr float kFilterScales[] = {kNumChannels, 7.0f / 127.0f, + 8.0f / 127.0f}; + constexpr int kFilterZeroPoints[] = {kNumChannels, 0, 0}; + // all values will be computed + float kBiasScales[std::extent::value] = {}; + // all values will be computed + int kBiasZeroPoints[std::extent::value] = {}; + + int16_t input_quantized[tflite::testing::kInputElementsQ2]; + int8_t filter_quantized[tflite::testing::kFilterElementsQ2]; + int16_t bias_quantized[tflite::testing::kBiasElementsQ2]; + int16_t output_quantized[tflite::testing::kOutputElementsQ2]; + float 
output_data[tflite::testing::kOutputElementsQ2]; + + // compressed filter data for kBinQuant scheme + constexpr uint8_t kBinQuantFilterData[] = {0x05, 0x34, 0xE5, + 0xDE, 0x54, 0xC1}; + constexpr float kBinQuantFilterValueTable[] = {1, 2, 3, 4, 5, 6, 0, 0, + 1, 2, 3, 4, 5, 6, 7, 8}; + constexpr int kBinQuantFilterBitWidth = 3; + // compressed bias data for kBinQuant scheme + constexpr uint8_t kBinQuantBiasData[] = {0x00}; + constexpr int kBinQuantBiasBitWidth = 1; + + tflite::testing::TestCompressionQuantizedInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + comp_info.filter_value_table = filter_quantized; + comp_info.filter_value_table_stride = + std::extent::value / kNumChannels; + comp_info.filter_bit_width = kBinQuantFilterBitWidth; + comp_info.filter_compressed = kBinQuantFilterData; + comp_info.filter_data = kBinQuantFilterValueTable; + comp_info.filter_dims_data = tflite::testing::kFilterShapeQ2; + comp_info.filter_scales = kFilterScales; + comp_info.filter_zero_points = kFilterZeroPoints; + comp_info.bias_value_table = bias_quantized; + comp_info.bias_value_table_stride = + std::extent::value / kNumChannels; + comp_info.bias_bit_width = kBinQuantBiasBitWidth; + comp_info.bias_compressed = kBinQuantBiasData; + comp_info.bias_data = tflite::testing::kBiasDataQ2; + comp_info.bias_dims_data = tflite::testing::kBiasShapeQ2; + comp_info.bias_scales = kBiasScales; + comp_info.bias_zero_points = kBiasZeroPoints; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestTransposeConvQuantizedCompressed( + tflite::testing::kInputShapeQ2, tflite::testing::kInputDataQ2, + input_quantized, input_scale, input_zero_point, + tflite::testing::kOutputShapeQ2, tflite::testing::kGoldenDataQ2, + output_data, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params, &comp_info)); +} + +#endif // USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleBiasTestQuantizedPerChannelBias64MultiChannel) { + // data from TfLite test: SimpleBiasTestQuantizedPerChannel16x8Bias64 + const float input_scale = 4.0f / 127.0f; + const float output_scale = 128.0f / 65536.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int16_t input_quantized[tflite::testing::kInputElementsQ2]; + int8_t filter_quantized[tflite::testing::kFilterElementsQ2]; + int64_t bias_quantized[tflite::testing::kBiasElementsQ2]; + int16_t output_quantized[tflite::testing::kOutputElementsQ2]; + float output_data[tflite::testing::kOutputElementsQ2]; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestTransposeConvQuantized( + tflite::testing::kInputShapeQ2, tflite::testing::kInputDataQ2, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilterShapeQ2, tflite::testing::kFilterDataQ2, + filter_quantized, tflite::testing::kBiasShapeQ2, + tflite::testing::kBiasDataQ2, bias_quantized, + tflite::testing::kOutputShapeQ2, tflite::testing::kGoldenDataQ2, + output_data, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params)); +} + +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST( + SimpleBiasTestQuantizedPerChannelBias64MultiChannelCompressed) { + // data from TfLite test: SimpleBiasTestQuantizedPerChannel16x8Bias64 + const float input_scale = 4.0f / 127.0f; + const float output_scale = 128.0f / 65536.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + constexpr int kNumChannels = 2; + + constexpr float kFilterScales[] = {kNumChannels, 7.0f / 127.0f, + 8.0f / 127.0f}; + 
constexpr int kFilterZeroPoints[] = {kNumChannels, 0, 0}; + // all values will be computed + float kBiasScales[std::extent::value] = {}; + // all values will be computed + int kBiasZeroPoints[std::extent::value] = {}; + + int16_t input_quantized[tflite::testing::kInputElementsQ2]; + int8_t filter_quantized[tflite::testing::kFilterElementsQ2]; + int64_t bias_quantized[tflite::testing::kBiasElementsQ2]; + int16_t output_quantized[tflite::testing::kOutputElementsQ2]; + float output_data[tflite::testing::kOutputElementsQ2]; + + // compressed filter data for kBinQuant scheme + constexpr uint8_t kBinQuantFilterData[] = {0x05, 0x34, 0xE5, + 0xDE, 0x54, 0xC1}; + constexpr float kBinQuantFilterValueTable[] = {1, 2, 3, 4, 5, 6, 0, 0, + 1, 2, 3, 4, 5, 6, 7, 8}; + constexpr int kBinQuantFilterBitWidth = 3; + // compressed bias data for kBinQuant scheme + constexpr uint8_t kBinQuantBiasData[] = {0x00}; + constexpr int kBinQuantBiasBitWidth = 2; + + tflite::testing::TestCompressionQuantizedInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + comp_info.filter_value_table = filter_quantized; + comp_info.filter_value_table_stride = + std::extent::value / kNumChannels; + comp_info.filter_bit_width = kBinQuantFilterBitWidth; + comp_info.filter_compressed = kBinQuantFilterData; + comp_info.filter_data = kBinQuantFilterValueTable; + comp_info.filter_dims_data = tflite::testing::kFilterShapeQ2; + comp_info.filter_scales = kFilterScales; + comp_info.filter_zero_points = kFilterZeroPoints; + comp_info.bias_value_table = bias_quantized; + comp_info.bias_value_table_stride = + std::extent::value / kNumChannels; + comp_info.bias_bit_width = kBinQuantBiasBitWidth; + comp_info.bias_compressed = kBinQuantBiasData; + comp_info.bias_data = tflite::testing::kBiasDataQ2; + comp_info.bias_dims_data = tflite::testing::kBiasShapeQ2; + comp_info.bias_scales = kBiasScales; + comp_info.bias_zero_points = kBiasZeroPoints; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestTransposeConvQuantizedCompressed( + tflite::testing::kInputShapeQ2, tflite::testing::kInputDataQ2, + input_quantized, input_scale, input_zero_point, + tflite::testing::kOutputShapeQ2, tflite::testing::kGoldenDataQ2, + output_data, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params, &comp_info)); +} + +#endif // USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestQuantized16x8PerChannelSingleChannel) { const float input_scale = 1.0f; const float output_scale = 1.0f; - const float filter_scale = 1.0f; const int input_zero_point = 0; const int output_zero_point = 0; int16_t input_quantized[tflite::testing::kInputElements]; int8_t filter_quantized[tflite::testing::kFilterElements]; - std::int64_t bias_quantized[tflite::testing::kBiasElements]; - int16_t golden_quantized[tflite::testing::kOutputElements]; - int zero_points[tflite::testing::kBiasElements + 1]; - float scales[tflite::testing::kBiasElements + 1]; + int64_t bias_quantized[tflite::testing::kBiasElements]; + int16_t output_quantized[tflite::testing::kOutputElements]; + float output_data[tflite::testing::kOutputElements]; TF_LITE_MICRO_EXPECT_EQ( kTfLiteOk, @@ -368,28 +961,25 @@ TF_LITE_MICRO_TEST(SimpleTestQuantized16x8PerChannel) { tflite::testing::kInputShape, tflite::testing::kInputData, input_quantized, input_scale, input_zero_point, tflite::testing::kFilterShape, tflite::testing::kFilterData, - filter_quantized, filter_scale, tflite::testing::kBiasShape, - tflite::testing::kBiasData, bias_quantized, scales, zero_points, 
+ filter_quantized, tflite::testing::kBiasShape, + tflite::testing::kBiasData, bias_quantized, tflite::testing::kOutputShape, tflite::testing::kGoldenData, - golden_quantized, output_scale, output_zero_point, - &tflite::testing::common_conv_params, output_data)); + output_data, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params)); } -TF_LITE_MICRO_TEST(SimpleTestQuantized16x8PerChannelWithInt16Bias) { - int16_t output_data[tflite::testing::kOutputElements]; - +TF_LITE_MICRO_TEST( + SimpleTestQuantized16x8PerChannelWithInt16BiasSingleChannel) { const float input_scale = 1.0f; const float output_scale = 1.0f; - const float filter_scale = 1.0f; const int input_zero_point = 0; const int output_zero_point = 0; int16_t input_quantized[tflite::testing::kInputElements]; int8_t filter_quantized[tflite::testing::kFilterElements]; int16_t bias_quantized[tflite::testing::kBiasElements]; - int16_t golden_quantized[tflite::testing::kOutputElements]; - int zero_points[tflite::testing::kBiasElements + 1]; - float scales[tflite::testing::kBiasElements + 1]; + int16_t output_quantized[tflite::testing::kOutputElements]; + float output_data[tflite::testing::kOutputElements]; TF_LITE_MICRO_EXPECT_EQ( kTfLiteOk, @@ -397,11 +987,11 @@ TF_LITE_MICRO_TEST(SimpleTestQuantized16x8PerChannelWithInt16Bias) { tflite::testing::kInputShape, tflite::testing::kInputData, input_quantized, input_scale, input_zero_point, tflite::testing::kFilterShape, tflite::testing::kFilterData, - filter_quantized, filter_scale, tflite::testing::kBiasShape, - tflite::testing::kBiasData, bias_quantized, scales, zero_points, + filter_quantized, tflite::testing::kBiasShape, + tflite::testing::kBiasData, bias_quantized, tflite::testing::kOutputShape, tflite::testing::kGoldenData, - golden_quantized, output_scale, output_zero_point, - &tflite::testing::common_conv_params, output_data)); + output_data, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params)); } TF_LITE_MICRO_TEST(InputOutputDifferentTypeIsError) { @@ -413,7 +1003,6 @@ TF_LITE_MICRO_TEST(InputOutputDifferentTypeIsError) { TfLiteIntArray* filter_dims = IntArrayFromInts(tflite::testing::kFilterShape); TfLiteIntArray* bias_dims = IntArrayFromInts(tflite::testing::kBiasShape); TfLiteIntArray* output_dims = IntArrayFromInts(tflite::testing::kOutputShape); - const int output_dims_count = tflite::ElementCount(*output_dims); constexpr int inputs_size = 4; constexpr int outputs_size = 1; constexpr int tensors_size = inputs_size + outputs_size; @@ -433,9 +1022,9 @@ TF_LITE_MICRO_TEST(InputOutputDifferentTypeIsError) { /*zero_point=*/0), }; TF_LITE_MICRO_EXPECT_EQ( - kTfLiteError, tflite::testing::InvokeTransposeConv( - tensors, tensors_size, output_dims_count, - &tflite::testing::common_conv_params, output_data)); + kTfLiteError, + tflite::testing::InvokeTransposeConv( + tensors, tensors_size, &tflite::testing::common_conv_params)); } TF_LITE_MICRO_TEST(HybridModeIsError) { @@ -447,7 +1036,6 @@ TF_LITE_MICRO_TEST(HybridModeIsError) { TfLiteIntArray* filter_dims = IntArrayFromInts(tflite::testing::kFilterShape); TfLiteIntArray* bias_dims = IntArrayFromInts(tflite::testing::kBiasShape); TfLiteIntArray* output_dims = IntArrayFromInts(tflite::testing::kOutputShape); - const int output_dims_count = tflite::ElementCount(*output_dims); constexpr int inputs_size = 4; constexpr int outputs_size = 1; @@ -471,9 +1059,9 @@ TF_LITE_MICRO_TEST(HybridModeIsError) { }; TF_LITE_MICRO_EXPECT_EQ( - kTfLiteError, 
tflite::testing::InvokeTransposeConv( - tensors, tensors_size, output_dims_count, - &tflite::testing::common_conv_params, output_data)); + kTfLiteError, + tflite::testing::InvokeTransposeConv( + tensors, tensors_size, &tflite::testing::common_conv_params)); } TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index 930da754bb5..f90bb2d62c0 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -36,6 +36,15 @@ limitations under the License. #include "tensorflow/lite/micro/tflite_bridge/flatbuffer_conversions_bridge.h" #include "tensorflow/lite/schema/schema_generated.h" +#ifdef USE_TFLM_COMPRESSION + +#include +#include + +#include "tensorflow/lite/micro/compression/metadata_generated.h" + +#endif // USE_TFLM_COMPRESSION + namespace tflite { namespace { @@ -355,6 +364,149 @@ TfLiteStatus InitializeTfLiteEvalTensorFromFlatbuffer( return kTfLiteOk; } +#ifdef USE_TFLM_COMPRESSION + +const tflite::micro::compression::Metadata* GetCompressionMetadata( + const Model& model) { + const auto metadata_vector = model.metadata(); + if (metadata_vector == nullptr) { + return nullptr; + } + auto buffers = model.buffers(); + if (buffers == nullptr) { + return nullptr; + } + const size_t metadata_string_length = std::strlen(kCompressionMetadataString); + for (size_t metadata_index = 0; metadata_index < metadata_vector->size(); + metadata_index++) { + auto metadata = metadata_vector->Get(metadata_index); + if (metadata->name() == nullptr || metadata->name()->size() == 0) { + continue; + } + const char* s = metadata->name()->c_str(); + if ((metadata->name()->size() == metadata_string_length) && + (std::strncmp(s, kCompressionMetadataString, metadata_string_length) == + 0)) { + auto buffer_index = metadata->buffer(); + if (buffer_index == 0 || buffer_index >= buffers->size()) { + MicroPrintf("Compression: Invalid buffer index %u", buffer_index); + continue; + } + auto vp = buffers->Get(buffer_index)->data(); + if (vp == nullptr || vp->data() == nullptr) { + MicroPrintf("Compression: Invalid data for buffer index %u", + buffer_index); + continue; + } + // TODO(ddavis-2015): support multiple compression methods + auto compression_metadata = + tflite::micro::compression::GetSizePrefixedMetadata(vp); + flatbuffers::Verifier verifier(vp->data(), vp->size(), + flatbuffers::Verifier::Options()); + if (!tflite::micro::compression::VerifyMetadataBuffer(verifier)) { + MicroPrintf("Compression: verification failure"); + return nullptr; + } else { + return compression_metadata; + } + } + } + + return nullptr; +} + +TfLiteStatus InitializeCompressionTensorDataFromFlatbuffer( + const Model& model, const tflite::micro::compression::LutTensor& lut_tensor, + CompressionTensorData* ctd) { + ctd->scheme = CompressionScheme::kBinQuant; + + const size_t subgraph_index = lut_tensor.subgraph(); + if (subgraph_index >= model.subgraphs()->size()) { + MicroPrintf("Compression: invalid subgraph index %u in LutTensor", + subgraph_index); + return kTfLiteError; + } + const size_t tensor_index = lut_tensor.tensor(); + auto tensors = model.subgraphs()->Get(subgraph_index)->tensors(); + if (tensor_index >= tensors->size()) { + MicroPrintf("Compression: 
invalid tensor index %u in LutTensor", + tensor_index); + return kTfLiteError; + } + const size_t index_bit_width = lut_tensor.index_bitwidth(); + if (index_bit_width > BinQuantData::kMaxBitWidth) { + MicroPrintf("Compression: invalid bit width %u in LutTensor", + index_bit_width); + return kTfLiteError; + } + ctd->data.bin_quant.compressed_bit_width = index_bit_width; + const size_t value_buffer_index = lut_tensor.value_buffer(); + if (value_buffer_index >= model.buffers()->size()) { + MicroPrintf("Compression: invalid value_buffer %u in LutTensor", + value_buffer_index); + return kTfLiteError; + } + auto value_buffer = model.buffers()->Get(value_buffer_index)->data(); + if (value_buffer == nullptr || value_buffer->data() == nullptr) { + MicroPrintf("Compression: invalid value table for value_buffer %u", + value_buffer_index); + return kTfLiteError; + } + ctd->data.bin_quant.value_table = value_buffer->data(); + auto tensor = + model.subgraphs()->Get(subgraph_index)->tensors()->Get(tensor_index); + if (tensor->shape() == nullptr) { + MicroPrintf("Compression: scalar tensors not supported"); + return kTfLiteError; + } + if (tensor->buffer() != lut_tensor.index_buffer()) { + MicroPrintf("Compression: mismatched index_buffer %u != %u in LutTensor", + lut_tensor.index_buffer(), tensor->buffer()); + return kTfLiteError; + } + TfLiteType tensor_type = kTfLiteNoType; + TfLiteStatus status = ConvertTensorType(tensor->type(), &tensor_type); + if (status != kTfLiteOk) { + MicroPrintf("Compression: failed to convert tensor type"); + return kTfLiteError; + } + size_t tensor_type_size = 0; + status = TfLiteTypeSizeOf(tensor_type, &tensor_type_size); + if (status != kTfLiteOk) { + MicroPrintf("Compression: failed to get tensor type size"); + return kTfLiteError; + } + if (tensor->quantization() != nullptr && + tensor->quantization()->scale() != nullptr && + tensor->quantization()->scale()->size() > 1) { + const size_t num_channels = tensor->quantization()->scale()->size(); + ctd->data.bin_quant.is_per_channel_quantized = true; + const TfLiteIntArray* dims = + FlatBufferVectorToTfLiteTypeArray(tensor->shape()); + int32_t quantized_axis = tensor->quantization()->quantized_dimension(); + if (quantized_axis == 0) { + ctd->data.bin_quant.use_alternate_axis = false; + } else if (quantized_axis == (dims->size - 1)) { + ctd->data.bin_quant.use_alternate_axis = true; + } else { + MicroPrintf("Compression: unsupported quantization axis %u", + quantized_axis); + return kTfLiteError; + } + ctd->data.bin_quant.value_table_channel_stride = + (value_buffer->size() / tensor_type_size) / num_channels; + } else { + ctd->data.bin_quant.is_per_channel_quantized = false; + ctd->data.bin_quant.use_alternate_axis = false; + ctd->data.bin_quant.value_table_channel_stride = + value_buffer->size() / tensor_type_size; + } + + return kTfLiteOk; +} + +#endif // USE_TFLM_COMPRESSION + } // namespace internal size_t MicroAllocator::GetDefaultTailUsage(bool is_memory_planner_given) { @@ -502,7 +654,11 @@ SubgraphAllocations* MicroAllocator::StartModelAllocation(const Model* model) { return nullptr; } - if (AllocateTfLiteEvalTensors(model, output) != kTfLiteOk || + if ( +#ifdef USE_TFLM_COMPRESSION + AllocateCompressedTensorsList(model, output) != kTfLiteOk || +#endif // USE_TFLM_COMPRESSION + AllocateTfLiteEvalTensors(model, output) != kTfLiteOk || AllocateNodeAndRegistrations(model, output) != kTfLiteOk) { return nullptr; } @@ -757,6 +913,96 @@ bool MicroAllocator::IsAllTempDeallocated() { return 
non_persistent_buffer_allocator_->IsAllTempDeallocated(); } +#ifdef USE_TFLM_COMPRESSION + +TfLiteStatus MicroAllocator::AllocateCompressedTensorsList( + const Model* model, SubgraphAllocations* subgraph_allocations) { + TFLITE_DCHECK(subgraph_allocations != nullptr); + + for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs()->size(); + subgraph_idx++) { + subgraph_allocations[subgraph_idx].compressed.tensors = nullptr; + } + + const tflite::micro::compression::Metadata* compression_metadata = + internal::GetCompressionMetadata(*model); + if (compression_metadata == nullptr) { + // no compression metadata is available + return kTfLiteOk; + } + if (compression_metadata->lut_tensors() == nullptr) { + MicroPrintf("Compression: invalid LutTensor vector"); + return kTfLiteError; + } + if (compression_metadata->lut_tensors()->size() == 0) { + MicroPrintf("Compression: zero length LutTensor vector"); + return kTfLiteError; + } + + for (size_t lut_tensors_index = 0; + lut_tensors_index < compression_metadata->lut_tensors()->size(); + lut_tensors_index++) { + auto lut_tensor = + compression_metadata->lut_tensors()->Get(lut_tensors_index); + + CompressionTensorData* ctd = reinterpret_cast( + persistent_buffer_allocator_->AllocatePersistentBuffer( + sizeof(CompressionTensorData), alignof(CompressionTensorData))); + if (ctd == nullptr) { + MicroPrintf( + "Compressions: failed to allocate memory for CompressionTensorData, " + "%d bytes required", + sizeof(CompressionTensorData)); + return kTfLiteError; + } + + TfLiteStatus status = + internal::InitializeCompressionTensorDataFromFlatbuffer( + *model, *lut_tensor, ctd); + if (status != kTfLiteOk) { + MicroPrintf("Compression: failed to initialize data for LutTensor %u", + lut_tensors_index); + return kTfLiteError; + } + + const size_t subgraph_index = lut_tensor->subgraph(); + if (subgraph_allocations[subgraph_index].compressed.tensors == nullptr) { + size_t alloc_count = + model->subgraphs()->Get(subgraph_index)->tensors()->size(); + CompressionTensorData** tensors = + reinterpret_cast( + persistent_buffer_allocator_->AllocatePersistentBuffer( + sizeof(CompressionTensorData*) * alloc_count, + alignof(CompressionTensorData*))); + if (tensors == nullptr) { + MicroPrintf( + "Compression: failed to allocate memory for compression tensor " + "list, %d bytes required", + sizeof(CompressionTensorData*) * alloc_count); + return kTfLiteError; + } + + subgraph_allocations[subgraph_index].compressed.tensors = tensors; + std::fill(tensors, tensors + alloc_count, nullptr); + } + + const size_t tensor_index = lut_tensor->tensor(); + if (subgraph_allocations[subgraph_index].compressed.tensors[tensor_index] != + nullptr) { + MicroPrintf("Compression: duplicate LutTensor subgraph %u tensor %u", + subgraph_index, tensor_index); + return kTfLiteError; + } else { + subgraph_allocations[subgraph_index].compressed.tensors[tensor_index] = + ctd; + } + } + + return kTfLiteOk; +} + +#endif // USE_TFLM_COMPRESSION + TfLiteStatus MicroAllocator::AllocateTfLiteEvalTensors( const Model* model, SubgraphAllocations* subgraph_allocations) { TFLITE_DCHECK(subgraph_allocations != nullptr); diff --git a/tensorflow/lite/micro/micro_allocator.h b/tensorflow/lite/micro/micro_allocator.h index 4eff167d67f..7a52c44bccf 100644 --- a/tensorflow/lite/micro/micro_allocator.h +++ b/tensorflow/lite/micro/micro_allocator.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,6 +26,12 @@ limitations under the License. #include "tensorflow/lite/micro/tflite_bridge/flatbuffer_conversions_bridge.h" #include "tensorflow/lite/schema/schema_generated.h" +#ifdef USE_TFLM_COMPRESSION + +#include "tensorflow/lite/micro/compression.h" + +#endif // USE_TFLM_COMPRESSION + namespace tflite { // TODO(b/199402574): rename to tflite_internal or just remove internal @@ -91,6 +97,9 @@ struct ScratchBufferHandle { struct SubgraphAllocations { NodeAndRegistration* node_and_registrations; TfLiteEvalTensor* tensors; +#ifdef USE_TFLM_COMPRESSION + CompressedTensorList compressed; +#endif // USE_TFLM_COMPRESSION }; // Allocator responsible for allocating memory for all intermediate tensors @@ -258,6 +267,15 @@ class MicroAllocator { MicroMemoryPlanner* memory_planner); virtual ~MicroAllocator(); +#ifdef USE_TFLM_COMPRESSION + + // Allocates an array in the arena of pointers to the compressions data + // required to decompress tensors for each subgraph within the model. + virtual TfLiteStatus AllocateCompressedTensorsList( + const Model* model, SubgraphAllocations* subgraph_allocations); + +#endif // USE_TFLM_COMPRESSION + // Allocates an array in the arena to hold pointers to the node and // registration pointers required to represent the inference graph of the // model. diff --git a/tensorflow/lite/micro/micro_context.cc b/tensorflow/lite/micro/micro_context.cc index 295b3c34463..55af3e39021 100644 --- a/tensorflow/lite/micro/micro_context.cc +++ b/tensorflow/lite/micro/micro_context.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,8 +18,10 @@ limitations under the License. #include #include +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/micro/micro_common.h" #include "tensorflow/lite/micro/micro_log.h" +#include "tensorflow/lite/micro/micro_utils.h" namespace tflite { namespace { @@ -34,6 +36,76 @@ int GetTensorIndex(int index, int max_size, const int* tensor_indices) { return -1; } +#ifdef USE_TFLM_COMPRESSION + +// TODO(ddavis-2015): break this up such that template expansion is decreased +template +T* DecompressToBuffer(const uint8_t* compressed_indices, + const size_t count_indices, void* buffer, + const CompressionTensorData& comp_data, + const size_t num_channels) { + const size_t compressed_bit_width = + comp_data.data.bin_quant.compressed_bit_width; + TFLITE_DCHECK(compressed_bit_width <= BinQuantData::kMaxBitWidth); + TFLITE_DCHECK(compressed_bit_width > 0); + + size_t channel = 0; + size_t index_in_channel = 0; + const size_t elements_per_channel = + comp_data.data.bin_quant.use_alternate_axis + ? 
1 + : count_indices / num_channels; + size_t buffer_index = 0; + size_t table_index = 0; + size_t table_index_bits_to_fill = compressed_bit_width; + size_t current_offset = 0; + size_t current_bits_remaining = 8; + uint8_t current_byte = compressed_indices[current_offset]; + + // no division (other than power of 2) inside loop + while (buffer_index < count_indices) { + while (table_index_bits_to_fill > 0) { + if (current_bits_remaining == 0) { + current_offset++; + current_byte = compressed_indices[current_offset]; + current_bits_remaining = 8; + } + + const uint8_t mask_bit_count = + std::min(table_index_bits_to_fill, + std::min(compressed_bit_width, current_bits_remaining)); + const uint8_t current_byte_mask = (1 << mask_bit_count) - 1; + table_index <<= mask_bit_count; + table_index |= + (current_byte >> (current_bits_remaining - mask_bit_count)) & + current_byte_mask; + + table_index_bits_to_fill -= mask_bit_count; + current_bits_remaining -= mask_bit_count; + } + + static_cast(buffer)[buffer_index] = + static_cast(comp_data.data.bin_quant.value_table) + [table_index + + (channel * comp_data.data.bin_quant.value_table_channel_stride)]; + buffer_index++; + table_index_bits_to_fill = compressed_bit_width; + table_index = 0; + index_in_channel++; + if (index_in_channel == elements_per_channel) { + index_in_channel = 0; + channel++; + if (channel == num_channels) { + channel = 0; + } + } + } + + return static_cast(buffer); +} + +#endif // USE_TFLM_COMPRESSION + } // namespace TfLiteTensor* MicroContext::AllocateTempInputTensor(const TfLiteNode* node, @@ -74,4 +146,65 @@ void MicroContextReportOpError(struct TfLiteContext* context, va_end(args); } +#ifdef USE_TFLM_COMPRESSION + +void* MicroContext::DecompressTensorToScratchBuffer( + const TfLiteEvalTensor& tensor, + const CompressionTensorData& compression_data, int scratch_buffer_handle) { + TFLITE_DCHECK(compression_data.scheme == CompressionScheme::kBinQuant); + TFLITE_DCHECK(scratch_buffer_handle != -1); + void* scratch_buffer = GetScratchBuffer(scratch_buffer_handle); + TFLITE_DCHECK(scratch_buffer != nullptr); + size_t count = ElementCount(*tensor.dims); + size_t num_channels = 1; + + if (compression_data.data.bin_quant.is_per_channel_quantized) { + const size_t channel_axis = + compression_data.data.bin_quant.use_alternate_axis + ? 
tensor.dims->size - 1 + : 0; + num_channels = tensor.dims->data[channel_axis]; + } + + switch (tensor.type) { + case kTfLiteBool: { + return DecompressToBuffer(static_cast(tensor.data.data), + count, scratch_buffer, compression_data, + num_channels); + } break; + case kTfLiteInt8: { + return DecompressToBuffer(static_cast(tensor.data.data), + count, scratch_buffer, compression_data, + num_channels); + } break; + case kTfLiteInt16: { + return DecompressToBuffer( + static_cast(tensor.data.data), count, scratch_buffer, + compression_data, num_channels); + } break; + case kTfLiteInt32: { + return DecompressToBuffer( + static_cast(tensor.data.data), count, scratch_buffer, + compression_data, num_channels); + } break; + case kTfLiteInt64: { + return DecompressToBuffer( + static_cast(tensor.data.data), count, scratch_buffer, + compression_data, num_channels); + } break; + case kTfLiteFloat32: { + return DecompressToBuffer(static_cast(tensor.data.data), + count, scratch_buffer, compression_data, + num_channels); + } break; + default: { + MicroPrintf("Unsupported decompression tensor type %d", tensor.type); + } break; + } + + return nullptr; +} + +#endif // USE_TFLM_COMPRESSION + } // namespace tflite diff --git a/tensorflow/lite/micro/micro_context.h b/tensorflow/lite/micro/micro_context.h index 2dd3233a159..33cad89143c 100644 --- a/tensorflow/lite/micro/micro_context.h +++ b/tensorflow/lite/micro/micro_context.h @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,6 +19,12 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/micro/micro_graph.h" +#ifdef USE_TFLM_COMPRESSION + +#include "tensorflow/lite/micro/compression.h" + +#endif // USE_TFLM_COMPRESSION + namespace tflite { // TODO(b/149795762): kTfLiteAbort cannot be part of the tflite TfLiteStatus. const TfLiteStatus kTfLiteAbort = static_cast(15); @@ -95,6 +101,30 @@ class MicroContext { virtual MicroGraph& graph() = 0; +#ifdef USE_TFLM_COMPRESSION + + // Available during Prepare & Eval. Returns false if tensor is not + // compressed. + virtual bool IsTensorCompressed(const TfLiteNode* node, int tensor_idx) = 0; + + // Only available during Prepare. The kernel is responsible for storing the + // scratch buffer handle. + virtual int AllocateDecompressionScratchBuffer(const TfLiteNode* node, + int tensor_idx) = 0; + + // Available during Prepare & Eval. Returns nullptr if tensor is not + // compressed. + virtual const CompressionTensorData* GetTensorCompressionData( + const TfLiteNode* node, int tensor_idx) = 0; + + // Only available during Eval. Returns nullptr on failure, otherwise returns a + // pointer to the scratch buffer. + virtual void* DecompressTensorToScratchBuffer( + const TfLiteEvalTensor& tensor, + const CompressionTensorData& compression_data, int scratch_buffer_handle); + +#endif // USE_TFLM_COMPRESSION + private: TF_LITE_REMOVE_VIRTUAL_DELETE }; diff --git a/tensorflow/lite/micro/micro_interpreter_context.cc b/tensorflow/lite/micro/micro_interpreter_context.cc index 098df15d522..2d6341d1894 100644 --- a/tensorflow/lite/micro/micro_interpreter_context.cc +++ b/tensorflow/lite/micro/micro_interpreter_context.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,8 +18,29 @@ limitations under the License. #include #include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/micro_utils.h" namespace tflite { + +namespace { + +#ifdef USE_TFLM_COMPRESSION + +int GetInputTensorIndex(const TfLiteNode* node, const int index) { + if (index >= 0 && index < node->inputs->size) { + const int tensor_index = node->inputs->data[index]; + if (tensor_index != kTfLiteOptionalTensor) { + return tensor_index; + } + } + return -1; +} + +#endif // USE_TFLM_COMPRESSION + +} // namespace + MicroInterpreterContext::MicroInterpreterContext(MicroAllocator* allocator, const Model* model, MicroInterpreterGraph* graph) @@ -106,4 +127,83 @@ MicroInterpreterContext::GetInterpreterState() const { return state_; } +#ifdef USE_TFLM_COMPRESSION + +// Available during Prepare & Eval. Returns false if tensor is not +// compressed. +bool MicroInterpreterContext::IsTensorCompressed(const TfLiteNode* node, + int tensor_idx) { + TFLITE_DCHECK(state_ == InterpreterState::kPrepare || + state_ == InterpreterState::kInvoke); + + const SubgraphAllocations* allocations = + &graph_.GetAllocations()[graph_.GetCurrentSubgraphIndex()]; + if (allocations->compressed.tensors == nullptr) { + return false; + } + int index = GetInputTensorIndex(node, tensor_idx); + if (index == -1) { + return false; + } + return allocations->compressed.tensors[index] != nullptr; +} + +// Only available during Prepare. The kernel is responsible for storing the +// scratch buffer handle. +int MicroInterpreterContext::AllocateDecompressionScratchBuffer( + const TfLiteNode* node, int tensor_idx) { + TFLITE_DCHECK(state_ == InterpreterState::kPrepare); + + const SubgraphAllocations* allocations = + &graph_.GetAllocations()[graph_.GetCurrentSubgraphIndex()]; + if (allocations->compressed.tensors == nullptr) { + return -1; + } + int index = GetInputTensorIndex(node, tensor_idx); + if (index == -1 || allocations->compressed.tensors[index] == nullptr) { + return -1; + } + const TfLiteEvalTensor* tensor = &allocations->tensors[index]; + const size_t byte_count = EvalTensorBytes(tensor); + int scratch_index = -1; + TfLiteStatus result = RequestScratchBufferInArena(byte_count, &scratch_index); + if (result != kTfLiteOk) { + return -1; + } + + return scratch_index; +} + +// Available during Prepare & Eval. Returns nullptr if tensor is not +// compressed. +const CompressionTensorData* MicroInterpreterContext::GetTensorCompressionData( + const TfLiteNode* node, int tensor_idx) { + TFLITE_DCHECK(state_ == InterpreterState::kPrepare || + state_ == InterpreterState::kInvoke); + + const SubgraphAllocations* allocations = + &graph_.GetAllocations()[graph_.GetCurrentSubgraphIndex()]; + if (allocations->compressed.tensors == nullptr) { + return nullptr; + } + int index = GetInputTensorIndex(node, tensor_idx); + if (index == -1) { + return nullptr; + } + return allocations->compressed.tensors[index]; +} + +// Only available during Eval. Returns nullptr on failure, otherwise returns a +// pointer to the scratch buffer. 
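+//
+// A typical Eval-time call from a kernel is sketched below. This is
+// illustrative only: micro_context, weights, weights_comp_td and
+// scratch_handle are assumed to come from the kernel's context lookup and
+// its Prepare-time state, and int8 weights are just an example type.
+//
+//   const int8_t* weights_data = static_cast<const int8_t*>(
+//       micro_context->DecompressTensorToScratchBuffer(
+//           *weights, *weights_comp_td, scratch_handle));
+//   TF_LITE_ENSURE(context, weights_data != nullptr);
+//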
+void* MicroInterpreterContext::DecompressTensorToScratchBuffer( + const TfLiteEvalTensor& tensor, + const CompressionTensorData& compression_data, int scratch_buffer_handle) { + TFLITE_DCHECK(state_ == InterpreterState::kInvoke); + + return MicroContext::DecompressTensorToScratchBuffer(tensor, compression_data, + scratch_buffer_handle); +} + +#endif // USE_TFLM_COMPRESSION + } // namespace tflite diff --git a/tensorflow/lite/micro/micro_interpreter_context.h b/tensorflow/lite/micro/micro_interpreter_context.h index 5986dc37fd2..7b336aacea9 100644 --- a/tensorflow/lite/micro/micro_interpreter_context.h +++ b/tensorflow/lite/micro/micro_interpreter_context.h @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -106,6 +106,31 @@ class MicroInterpreterContext : public MicroContext { // housekeeping in MicroInterpreterContext. void SetScratchBufferHandles(ScratchBufferHandle* scratch_buffer_handles); +#ifdef USE_TFLM_COMPRESSION + + // Available during Prepare & Eval. Returns false if tensor is not + // compressed. + bool IsTensorCompressed(const TfLiteNode* node, int tensor_idx) override; + + // Only available during Prepare. The kernel is responsible for storing the + // scratch buffer handle. + int AllocateDecompressionScratchBuffer(const TfLiteNode* node, + int tensor_idx) override; + + // Available during Prepare & Eval. Returns nullptr if tensor is not + // compressed. + const CompressionTensorData* GetTensorCompressionData( + const TfLiteNode* node, int tensor_idx) override; + + // Only available during Eval. Returns nullptr on failure, otherwise returns a + // pointer to the scratch buffer. + void* DecompressTensorToScratchBuffer( + const TfLiteEvalTensor& tensor, + const CompressionTensorData& compression_data, + int scratch_buffer_handle) override; + +#endif // USE_TFLM_COMPRESSION + private: MicroAllocator& allocator_; MicroInterpreterGraph& graph_; diff --git a/tensorflow/lite/micro/micro_interpreter_test.cc b/tensorflow/lite/micro/micro_interpreter_test.cc index e44de6b09aa..873ea96ac1e 100644 --- a/tensorflow/lite/micro/micro_interpreter_test.cc +++ b/tensorflow/lite/micro/micro_interpreter_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/lite/micro/micro_interpreter.h" #include +#include #include "tensorflow/lite/micro/arena_allocator/recording_single_arena_buffer_allocator.h" #include "tensorflow/lite/micro/compatibility.h" @@ -108,6 +109,58 @@ TF_LITE_MICRO_TEST(TestInterpreter) { TF_LITE_MICRO_EXPECT_EQ(tflite::testing::MockCustom::freed_, true); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(TestInterpreterCompression) { + const tflite::Model* model = tflite::testing::GetSimpleMockModelCompressed(); + TF_LITE_MICRO_EXPECT(nullptr != model); + tflite::testing::TestingOpResolver op_resolver; + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, + tflite::testing::GetTestingOpResolver(op_resolver)); + + constexpr size_t allocator_buffer_size = 2000; + uint8_t allocator_buffer[allocator_buffer_size]; + + // Create a new scope so that we can test the destructor. 
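+  // Per the mock model built in test_helpers.cc, the compressed weights
+  // decode (4-bit indices into the int16 value table) to
+  //   {1, 2, 3, 4, 5, -1, -2, -3, -4, -5, 1, 2, 3, 4, 5},
+  // and BroadcastAddOp adds the scalar input (42 below) to every element,
+  // which is how kGolden is derived.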
+ { + tflite::MicroInterpreter interpreter(model, op_resolver, allocator_buffer, + allocator_buffer_size); + TF_LITE_MICRO_EXPECT_EQ(interpreter.AllocateTensors(), kTfLiteOk); + TF_LITE_MICRO_EXPECT_EQ(static_cast(1), interpreter.inputs_size()); + TF_LITE_MICRO_EXPECT_EQ(static_cast(1), interpreter.outputs_size()); + + TfLiteTensor* input = interpreter.input(0); + TF_LITE_MICRO_EXPECT(nullptr != input); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt16, input->type); + TF_LITE_MICRO_EXPECT_EQ(1, input->dims->size); + TF_LITE_MICRO_EXPECT_EQ(1, input->dims->data[0]); + TF_LITE_MICRO_EXPECT_EQ(static_cast(2), input->bytes); + TF_LITE_MICRO_EXPECT(nullptr != input->data.data); + static_cast(input->data.data)[0] = 42; + + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, interpreter.Invoke()); + + const std::initializer_list kGolden = { + 43, 44, 45, 46, 47, 41, 40, 39, 38, 37, 43, 44, 45, 46, 47}; + const int kGoldenCount = kGolden.size(); + TfLiteTensor* output = interpreter.output(0); + TF_LITE_MICRO_EXPECT(nullptr != output); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt16, output->type); + TF_LITE_MICRO_EXPECT_EQ(1, output->dims->size); + TF_LITE_MICRO_EXPECT_EQ(kGoldenCount, output->dims->data[0]); + TF_LITE_MICRO_EXPECT_EQ( + static_cast(kGoldenCount * sizeof(*kGolden.begin())), + output->bytes); + TF_LITE_MICRO_EXPECT(nullptr != output->data.data); + for (int i = 0; i < kGoldenCount; i++) { + TF_LITE_MICRO_EXPECT_EQ(static_cast(output->data.data)[i], + kGolden.begin()[i]); + } + } +} + +#endif // USE_TFLM_COMPRESSION + TF_LITE_MICRO_TEST(TestMultiTenantInterpreter) { tflite::testing::TestingOpResolver op_resolver; TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, diff --git a/tensorflow/lite/micro/recording_micro_allocator.cc b/tensorflow/lite/micro/recording_micro_allocator.cc index f41dba61d7d..18addaee5f7 100644 --- a/tensorflow/lite/micro/recording_micro_allocator.cc +++ b/tensorflow/lite/micro/recording_micro_allocator.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
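The hunks below add a kCompressionData bucket to the recording allocator so
compression-related arena usage can be audited on its own. A minimal sketch of
reading that bucket after model allocation has finished (illustrative;
allocator is assumed to be a RecordingMicroAllocator that has already run
StartModelAllocation and FinishModelAllocation):

    tflite::RecordedAllocation rec = allocator->GetRecordedAllocation(
        tflite::RecordedAllocationType::kCompressionData);
    MicroPrintf("compression data: %d allocations, %d bytes requested",
                static_cast<int>(rec.count),
                static_cast<int>(rec.requested_bytes));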
@@ -78,6 +78,12 @@ RecordedAllocation RecordingMicroAllocator::GetRecordedAllocation( return recorded_node_and_registration_array_data_; case RecordedAllocationType::kOpData: return recorded_op_data_; +#ifdef USE_TFLM_COMPRESSION + case RecordedAllocationType::kCompressionData: + return recorded_compression_data_; +#endif // USE_TFLM_COMPRESSION + default: + break; } MicroPrintf("Invalid allocation type supplied: %d", allocation_type); return RecordedAllocation(); @@ -112,6 +118,13 @@ void RecordingMicroAllocator::PrintAllocations() const { "NodeAndRegistration structs"); PrintRecordedAllocation(RecordedAllocationType::kOpData, "Operator runtime data", "OpData structs"); + +#ifdef USE_TFLM_COMPRESSION + + PrintRecordedAllocation(RecordedAllocationType::kCompressionData, + "Persistent compression data", "allocations"); + +#endif // USE_TFLM_COMPRESSION } void* RecordingMicroAllocator::AllocatePersistentBuffer(size_t bytes) { @@ -228,6 +241,21 @@ TfLiteStatus RecordingMicroAllocator::PopulateTfLiteTensorFromFlatbuffer( return status; } +#ifdef USE_TFLM_COMPRESSION + +TfLiteStatus RecordingMicroAllocator::AllocateCompressedTensorsList( + const Model* model, SubgraphAllocations* subgraph_allocations) { + RecordedAllocation allocations = SnapshotAllocationUsage(); + + TfLiteStatus status = MicroAllocator::AllocateCompressedTensorsList( + model, subgraph_allocations); + + RecordAllocationUsage(allocations, recorded_compression_data_); + return status; +} + +#endif // USE_TFLM_COMPRESSION + RecordedAllocation RecordingMicroAllocator::SnapshotAllocationUsage() const { return {/*requested_bytes=*/recording_memory_allocator_->GetRequestedBytes(), /*used_bytes=*/recording_memory_allocator_->GetUsedBytes(), diff --git a/tensorflow/lite/micro/recording_micro_allocator.h b/tensorflow/lite/micro/recording_micro_allocator.h index b6f69264dc0..80f163240d3 100644 --- a/tensorflow/lite/micro/recording_micro_allocator.h +++ b/tensorflow/lite/micro/recording_micro_allocator.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -33,6 +33,11 @@ enum class RecordedAllocationType { kTfLiteTensorVariableBufferData, kNodeAndRegistrationArray, kOpData, +#ifdef USE_TFLM_COMPRESSION + kCompressionData, +#endif // USE_TFLM_COMPRESSION + + kNumAllocationTypes, // must be last }; // Container for holding information about allocation recordings by a given @@ -93,6 +98,13 @@ class RecordingMicroAllocator : public MicroAllocator { int subgraph_index, bool allocate_temp) override; +#ifdef USE_TFLM_COMPRESSION + + TfLiteStatus AllocateCompressedTensorsList( + const Model* model, SubgraphAllocations* subgraph_allocations) override; + +#endif // USE_TFLM_COMPRESSION + private: RecordingMicroAllocator(RecordingSingleArenaBufferAllocator* memory_allocator, MicroMemoryPlanner* memory_planner); @@ -113,6 +125,9 @@ class RecordingMicroAllocator : public MicroAllocator { RecordedAllocation recorded_persistent_buffer_data_ = {}; RecordedAllocation recorded_tflite_tensor_variable_buffer_data_ = {}; RecordedAllocation recorded_node_and_registration_array_data_ = {}; +#ifdef USE_TFLM_COMPRESSION + RecordedAllocation recorded_compression_data_ = {}; +#endif // USE_TFLM_COMPRESSION // TODO(b/187993291): Re-enable OpData allocating tracking. 
RecordedAllocation recorded_op_data_ = {}; diff --git a/tensorflow/lite/micro/recording_micro_allocator_test.cc b/tensorflow/lite/micro/recording_micro_allocator_test.cc index 9d3a5965de4..1c4df8862a4 100644 --- a/tensorflow/lite/micro/recording_micro_allocator_test.cc +++ b/tensorflow/lite/micro/recording_micro_allocator_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -317,6 +317,70 @@ TF_LITE_MICRO_TEST(TestMultiSubgraphModel) { num_tensors * TF_LITE_EVAL_TENSOR_STRUCT_SIZE); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(TestCompressedModel) { + tflite::ScratchBufferHandle* scratch_buffer_handles = nullptr; + tflite::testing::TestingOpResolver ops_resolver; + const tflite::Model* model = tflite::testing::GetSimpleMockModelCompressed(); + const int arena_size = 2048; + + uint8_t arena[arena_size]; + + tflite::RecordingMicroAllocator* micro_allocator = + tflite::RecordingMicroAllocator::Create(arena, arena_size); + TF_LITE_MICRO_EXPECT(micro_allocator != nullptr); + TF_LITE_MICRO_CHECK_FAIL(); + + tflite::SubgraphAllocations* subgraph_allocations = + micro_allocator->StartModelAllocation(model); + TF_LITE_MICRO_EXPECT(nullptr != subgraph_allocations); + TF_LITE_MICRO_CHECK_FAIL(); + + TfLiteStatus status = micro_allocator->FinishModelAllocation( + model, subgraph_allocations, &scratch_buffer_handles); + TF_LITE_MICRO_EXPECT_EQ(status, kTfLiteOk); + TF_LITE_MICRO_CHECK_FAIL(); + + micro_allocator->PrintAllocations(); + + size_t count_compression_allocations = 0; + size_t size_compression_allocations = 0; + for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs()->size(); + subgraph_idx++) { + tflite::CompressionTensorData** ctl = + subgraph_allocations[subgraph_idx].compressed.tensors; + if (ctl == nullptr) { + continue; + } + const tflite::SubGraph* subgraph = model->subgraphs()->Get(subgraph_idx); + const size_t num_tensors = subgraph->tensors()->size(); + for (size_t i = 0; i < num_tensors; i++) { + if (ctl[i] != nullptr) { + count_compression_allocations++; + size_compression_allocations += sizeof(tflite::CompressionTensorData); + } + } + // Add the CompressionTensorData array + count_compression_allocations++; + size_compression_allocations += + num_tensors * sizeof(tflite::CompressionTensorData*); + } + + tflite::RecordedAllocation recorded_allocation = + micro_allocator->GetRecordedAllocation( + tflite::RecordedAllocationType::kCompressionData); + + TF_LITE_MICRO_EXPECT_EQ(recorded_allocation.count, + count_compression_allocations); + TF_LITE_MICRO_EXPECT_EQ(recorded_allocation.requested_bytes, + size_compression_allocations); + TF_LITE_MICRO_EXPECT_GE(recorded_allocation.used_bytes, + size_compression_allocations); +} + +#endif // USE_TFLM_COMPRESSION + // TODO(b/158124094): Find a way to audit OpData allocations on // cross-architectures. diff --git a/tensorflow/lite/micro/test_helper_custom_ops.cc b/tensorflow/lite/micro/test_helper_custom_ops.cc index 374aabcc9df..97577699961 100644 --- a/tensorflow/lite/micro/test_helper_custom_ops.cc +++ b/tensorflow/lite/micro/test_helper_custom_ops.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -35,6 +35,18 @@ limitations under the License. namespace tflite { namespace testing { +namespace { + +template +void BroadcastAdd(const T input_scalar, const T* weights, T* output, + const size_t count) { + for (size_t i = 0; i < count; i++) { + output[i] = input_scalar + weights[i]; + } +} + +} // namespace + const TFLMRegistration* PackerOp::getRegistration() { return GetMutableRegistration(); } @@ -107,5 +119,180 @@ TfLiteStatus PackerOp::Invoke(TfLiteContext* context, TfLiteNode* node) { bool PackerOp::freed_ = false; +const TFLMRegistration* BroadcastAddOp::getRegistration() { + return GetMutableRegistration(); +} + +TFLMRegistration* BroadcastAddOp::GetMutableRegistration() { + static TFLMRegistration r; + r.init = Init; + r.prepare = Prepare; + r.invoke = Invoke; + return &r; +} + +void* BroadcastAddOp::Init(TfLiteContext* context, const char* buffer, + size_t length) { +#ifdef USE_TFLM_COMPRESSION + + weight_scratch_index_ = -1; + +#endif // USE_TFLM_COMPRESSION + + // Do nothing. + return nullptr; +} + +TfLiteStatus BroadcastAddOp::Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* weights = micro_context->AllocateTempInputTensor(node, 1); + TF_LITE_ENSURE(context, weights != nullptr); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); + TF_LITE_ENSURE(context, output != nullptr); + + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, weights->type); + TF_LITE_ENSURE( + context, input->type == kTfLiteFloat32 || input->type == kTfLiteInt8 || + input->type == kTfLiteInt16 || input->type == kTfLiteInt32 || + input->type == kTfLiteInt64); + TF_LITE_ENSURE(context, input->quantization.type == kTfLiteNoQuantization); + TF_LITE_ENSURE(context, weights->quantization.type == kTfLiteNoQuantization); + TF_LITE_ENSURE(context, output->quantization.type == kTfLiteNoQuantization); + TF_LITE_ENSURE(context, + ElementCount(*weights->dims) == ElementCount(*output->dims)); + TF_LITE_ENSURE(context, ElementCount(*input->dims) == 1); + TF_LITE_ENSURE(context, input->dims->size == 1); + TF_LITE_ENSURE(context, weights->dims->size == 1); + +#ifdef USE_TFLM_COMPRESSION + + // Compression scratch buffers. + // These will only be allocated if the tensor is compressed. 
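+  // AllocateDecompressionScratchBuffer() returns -1 when the tensor is not
+  // compressed; the returned handle is kept in static state so Invoke() can
+  // locate the decompression scratch buffer later.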
+ weight_scratch_index_ = + micro_context->AllocateDecompressionScratchBuffer(node, 1); + if (micro_context->IsTensorCompressed(node, 1)) { + TF_LITE_ENSURE(context, weight_scratch_index_ != -1); + } else { + TF_LITE_ENSURE(context, weight_scratch_index_ == -1); + } + +#endif // USE_TFLM_COMPRESSION + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(weights); + micro_context->DeallocateTempTfLiteTensor(output); + + return kTfLiteOk; +} + +TfLiteStatus BroadcastAddOp::Invoke(TfLiteContext* context, TfLiteNode* node) { + const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0); + TF_LITE_ENSURE(context, input != nullptr); + const TfLiteEvalTensor* weights = + tflite::micro::GetEvalInput(context, node, 1); + TF_LITE_ENSURE(context, weights != nullptr); + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); + TF_LITE_ENSURE(context, output != nullptr); + +#ifdef USE_TFLM_COMPRESSION + + MicroContext* micro_context = GetMicroContext(context); + + const CompressionTensorData* weights_comp_td = + micro_context->GetTensorCompressionData(node, 1); + if (micro_context->IsTensorCompressed(node, 1)) { + TF_LITE_ENSURE(context, weights_comp_td != nullptr); + } else { + TF_LITE_ENSURE(context, weights_comp_td == nullptr); + } + +#endif // USE_TFLM_COMPRESSION + + switch (input->type) { + case kTfLiteFloat32: { + BroadcastAdd( + tflite::micro::GetTensorData(input)[0], +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, weights, weights_comp_td, weight_scratch_index_), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(weights), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(output), + ElementCount(*output->dims)); + } break; + + case kTfLiteInt8: { + BroadcastAdd( + tflite::micro::GetTensorData(input)[0], +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, weights, weights_comp_td, weight_scratch_index_), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(weights), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(output), + ElementCount(*output->dims)); + } break; + + case kTfLiteInt16: { + BroadcastAdd( + tflite::micro::GetTensorData(input)[0], +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, weights, weights_comp_td, weight_scratch_index_), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(weights), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(output), + ElementCount(*output->dims)); + } break; + + case kTfLiteInt32: { + BroadcastAdd( + tflite::micro::GetTensorData(input)[0], +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, weights, weights_comp_td, weight_scratch_index_), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(weights), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(output), + ElementCount(*output->dims)); + } break; + + case kTfLiteInt64: { + BroadcastAdd( + tflite::micro::GetTensorData(input)[0], +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, weights, weights_comp_td, weight_scratch_index_), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(weights), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(output), + ElementCount(*output->dims)); + } break; + + default: { + MicroPrintf("Input type %s (%d) not supported.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; + } + } + + return kTfLiteOk; +} + +#ifdef 
USE_TFLM_COMPRESSION + +int BroadcastAddOp::weight_scratch_index_ = -1; + +#endif // USE_TFLM_COMPRESSION + } // namespace testing } // namespace tflite diff --git a/tensorflow/lite/micro/test_helper_custom_ops.h b/tensorflow/lite/micro/test_helper_custom_ops.h index d28bb4038f1..53a8cc3bdd4 100644 --- a/tensorflow/lite/micro/test_helper_custom_ops.h +++ b/tensorflow/lite/micro/test_helper_custom_ops.h @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -43,6 +43,23 @@ class PackerOp { static bool freed_; }; +// This op optionally supports compressed weights +class BroadcastAddOp { + public: + static const TFLMRegistration* getRegistration(); + static TFLMRegistration* GetMutableRegistration(); + static void* Init(TfLiteContext* context, const char* buffer, size_t length); + static TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node); + static TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node); + + private: +#ifdef USE_TFLM_COMPRESSION + + static int weight_scratch_index_; // decompression scratch buffer index + +#endif // USE_TFLM_COMPRESSION +}; + } // namespace testing } // namespace tflite diff --git a/tensorflow/lite/micro/test_helpers.cc b/tensorflow/lite/micro/test_helpers.cc index 3f0f5ec0826..d5f50773f69 100644 --- a/tensorflow/lite/micro/test_helpers.cc +++ b/tensorflow/lite/micro/test_helpers.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/lite/micro/test_helpers.h" +#include #include #include #include @@ -33,6 +34,12 @@ limitations under the License. #include "tensorflow/lite/micro/test_helper_custom_ops.h" #include "tensorflow/lite/schema/schema_generated.h" +#ifdef USE_TFLM_COMPRESSION + +#include "tensorflow/lite/micro/compression/metadata_generated.h" + +#endif // USE_TFLM_COMPRESSION + // TODO(b/170464050): Use TFLM test only version of schema_utils. 
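+// The generated header included above provides the
+// tflite::micro::compression API (LutTensor, CreateMetadata,
+// FinishMetadataBuffer) used by BuildSimpleMockModelCompressed() below.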
namespace tflite { @@ -236,7 +243,7 @@ const Model* ModelBuilder::BuildModel( *builder_, 0, builder_->CreateVector(operator_codes_, next_operator_code_id_), builder_->CreateVector(subgraphs, subgraphs_size), - builder_->CreateString("teset_model"), + builder_->CreateString("test_model"), builder_->CreateVector(buffers, buffer_size), 0, builder_->CreateVector(metadata_, ModelBuilder::nbr_of_metadata_buffers_)); @@ -245,7 +252,7 @@ const Model* ModelBuilder::BuildModel( *builder_, 0, builder_->CreateVector(operator_codes_, next_operator_code_id_), builder_->CreateVector(subgraphs, subgraphs_size), - builder_->CreateString("teset_model"), + builder_->CreateString("test_model"), builder_->CreateVector(buffers, buffer_size)); } @@ -578,6 +585,116 @@ const Model* BuildSimpleMockModel() { return model; } +#ifdef USE_TFLM_COMPRESSION + +const flatbuffers::span BuildLutMetadata( + const std::initializer_list& + lut_tensor_structs) { + using flatbuffers::Offset; + namespace compression = tflite::micro::compression; + + flatbuffers::FlatBufferBuilder* builder = BuilderInstance(); + auto lut_tensors = builder->CreateVectorOfStructs(lut_tensor_structs.begin(), + lut_tensor_structs.size()); + auto metadata = compression::CreateMetadata(*builder, lut_tensors); + compression::FinishMetadataBuffer(*builder, metadata); + return builder->GetBufferSpan(); +} + +const Model* BuildSimpleMockModelCompressed() { + using flatbuffers::Offset; + using flatbuffers::Vector; + using tflite::micro::compression::LutTensor; + constexpr uint kEmptyBuffer = 0; + constexpr uint kMetadataBuffer = 1; + constexpr uint kWeightsBuffer = 2; + constexpr uint kValueTableBuffer = 3; + // constexpr uint kInputTensor = 0; + constexpr uint kWeightsTensor = 1; + // constexpr uint kOutputTensor = 2; + constexpr uint kSubgraphIndex = 0; + constexpr uint kCompressedBitWidth = 4; + + const std::initializer_list lut_tensors = { + LutTensor(kSubgraphIndex, kWeightsTensor, kCompressedBitWidth, + kWeightsBuffer, kValueTableBuffer), + }; + auto lut_tensors_span = BuildLutMetadata(lut_tensors); + + flatbuffers::FlatBufferBuilder* builder = BuilderInstance(); + + // [1, 2, 3, 4, 5, -1, -2, -3, -4, -5, 1, 2, 3, 4, 5] + const std::initializer_list weights_data = {0x01, 0x23, 0x45, 0x98, + 0x76, 0x01, 0x23, 0x40}; + const std::initializer_list value_table_data = {1, 2, 3, 4, 5, + -1, -5, -4, -3, -2}; + auto value_table_offset = builder->CreateVector(value_table_data).o; + const std::initializer_list> buffers = { + CreateBuffer(*builder), + CreateBuffer(*builder, builder->CreateVector(lut_tensors_span)), + CreateBuffer(*builder, builder->CreateVector(weights_data)), + CreateBuffer(*builder, Offset>(value_table_offset)), + }; + + const std::initializer_list input_shape = {1}; + const std::initializer_list weights_shape = {15}; + const std::initializer_list output_shape = weights_shape; + const std::initializer_list> tensors = { + CreateTensor(*builder, builder->CreateVector(input_shape), + TensorType_INT16, kEmptyBuffer, + builder->CreateString("test_input_tensor"), 0, false), + CreateTensor(*builder, builder->CreateVector(weights_shape), + TensorType_INT16, kWeightsBuffer, + builder->CreateString("test_weight_tensor"), 0, false), + CreateTensor(*builder, builder->CreateVector(output_shape), + TensorType_INT16, kEmptyBuffer, + builder->CreateString("test_output_tensor"), 0, false), + }; + + const std::initializer_list subgraph_inputs = {0}; + const std::initializer_list subgraph_outputs = {2}; + const std::initializer_list operator_inputs = {0, 1}; + 
const std::initializer_list operator_outputs = {2}; + const std::initializer_list> operators = { + CreateOperator(*builder, 0, builder->CreateVector(operator_inputs), + builder->CreateVector(operator_outputs), + BuiltinOptions_NONE), + }; + + const std::initializer_list> subgraphs = { + CreateSubGraph(*builder, builder->CreateVector(tensors), + builder->CreateVector(subgraph_inputs), + builder->CreateVector(subgraph_outputs), + builder->CreateVector(operators), + builder->CreateString("test_subgraph")), + }; + + const std::initializer_list> operator_codes = { + CreateOperatorCodeDirect(*builder, /*deprecated_builtin_code=*/0, + "broadcast_add_op", + /*version=*/0, BuiltinOperator_CUSTOM), + }; + + const std::initializer_list> metadata = { + CreateMetadata(*builder, + builder->CreateString(kCompressionMetadataString), + kMetadataBuffer), + }; + + const Offset model_offset = CreateModel( + *builder, 0, builder->CreateVector(operator_codes), + builder->CreateVector(subgraphs), builder->CreateString("test_model"), + builder->CreateVector(buffers), 0, builder->CreateVector(metadata)); + + FinishModelBuffer(*builder, model_offset); + void* model_pointer = builder->GetBufferPointer(); + const Model* model = flatbuffers::GetRoot(model_pointer); + + return model; +} + +#endif // USE_TFLM_COMPRESSION + const Model* BuildComplexMockModel() { using flatbuffers::Offset; flatbuffers::FlatBufferBuilder* builder = BuilderInstance(); @@ -1665,6 +1782,8 @@ TfLiteStatus GetTestingOpResolver( op_resolver.AddCustom("no_op", NoOp::GetMutableRegistration())); TF_LITE_ENSURE_STATUS(op_resolver.AddCustom( "custom_packer_op", PackerOp::GetMutableRegistration())); + TF_LITE_ENSURE_STATUS(op_resolver.AddCustom( + "broadcast_add_op", BroadcastAddOp::GetMutableRegistration())); TF_LITE_ENSURE_STATUS(op_resolver.AddIf()); return kTfLiteOk; } @@ -1698,6 +1817,18 @@ const Model* GetSimpleMockModel() { return model; } +#ifdef USE_TFLM_COMPRESSION + +const Model* GetSimpleMockModelCompressed() { + static Model* model = nullptr; + if (!model) { + model = const_cast(BuildSimpleMockModelCompressed()); + } + return model; +} + +#endif // USE_TFLM_COMPRESSION + const Model* GetSimpleMultipleInputsModel() { static Model* model = nullptr; if (!model) { @@ -1890,6 +2021,7 @@ TfLiteFloatArray* FloatArrayFromFloats(const float* floats) { return reinterpret_cast(const_cast(floats)); } +// TODO(ddavis-2015): make template TfLiteTensor CreateQuantizedBiasTensor(const float* data, int16_t* quantized, TfLiteIntArray* dims, float input_scale, float weights_scale, bool is_variable) { @@ -1904,6 +2036,7 @@ TfLiteTensor CreateQuantizedBiasTensor(const float* data, int16_t* quantized, return result; } +// TODO(ddavis-2015): make template TfLiteTensor CreateQuantizedBiasTensor(const float* data, int32_t* quantized, TfLiteIntArray* dims, float input_scale, float weights_scale, bool is_variable) { @@ -1918,6 +2051,7 @@ TfLiteTensor CreateQuantizedBiasTensor(const float* data, int32_t* quantized, return result; } +// TODO(ddavis-2015): make template TfLiteTensor CreateQuantizedBiasTensor(const float* data, std::int64_t* quantized, TfLiteIntArray* dims, float input_scale, @@ -1933,37 +2067,7 @@ TfLiteTensor CreateQuantizedBiasTensor(const float* data, return result; } -// Quantizes int32_t bias tensor with per-channel weights determined by input -// scale multiplied by weight scale for each channel. 
-template -TfLiteTensor CreatePerChannelQuantizedBiasTensor( - const float* input, T* quantized, TfLiteIntArray* dims, float input_scale, - float* weight_scales, float* scales, int* zero_points, - TfLiteAffineQuantization* affine_quant, int quantized_dimension, - bool is_variable) { - int input_size = ElementCount(*dims); - int num_channels = dims->data[quantized_dimension]; - // First element is reserved for array length - zero_points[0] = num_channels; - scales[0] = static_cast(num_channels); - float* scales_array = &scales[1]; - for (int i = 0; i < num_channels; i++) { - scales_array[i] = input_scale * weight_scales[i]; - zero_points[i + 1] = 0; - } - - SymmetricPerChannelQuantize(input, quantized, input_size, num_channels, - scales_array); - - affine_quant->scale = FloatArrayFromFloats(scales); - affine_quant->zero_point = IntArrayFromInts(zero_points); - affine_quant->quantized_dimension = quantized_dimension; - - TfLiteTensor result = CreateTensor(quantized, dims, is_variable); - result.quantization = {kTfLiteAffineQuantization, affine_quant}; - return result; -} - +// TODO(ddavis-2015): remove TfLiteTensor CreatePerChannelQuantizedBiasTensor( const float* input, int32_t* quantized, TfLiteIntArray* dims, float input_scale, float* weight_scales, float* scales, int* zero_points, @@ -1974,6 +2078,7 @@ TfLiteTensor CreatePerChannelQuantizedBiasTensor( affine_quant, quantized_dimension, is_variable); } +// TODO(ddavis-2015): remove TfLiteTensor CreatePerChannelQuantizedBiasTensor( const float* input, std::int64_t* quantized, TfLiteIntArray* dims, float input_scale, float* weight_scales, float* scales, int* zero_points, diff --git a/tensorflow/lite/micro/test_helpers.h b/tensorflow/lite/micro/test_helpers.h index 6315b9fecdc..bad6e47d672 100644 --- a/tensorflow/lite/micro/test_helpers.h +++ b/tensorflow/lite/micro/test_helpers.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -112,6 +112,15 @@ TfLiteStatus GetTestingOpResolver(TestingOpResolver& op_resolver); // 1 layer of weights, 1 output Tensor, and 1 operator. const Model* GetSimpleMockModel(); +#ifdef USE_TFLM_COMPRESSION + +// Returns a simple example flatbuffer TensorFlow Lite model. Contains 1 input, +// 1 layer of weights, 1 output Tensor, and 1 operator (BroadcastAddOp). The +// weights tensor is compressed. +const Model* GetSimpleMockModelCompressed(); + +#endif // USE_TFLM_COMPRESSION + // Returns a flatbuffer TensorFlow Lite model with more inputs, variable // tensors, and operators. const Model* GetComplexMockModel(); @@ -220,8 +229,6 @@ TfLiteTensor CreateTensor(const T* data, TfLiteIntArray* dims, result.is_variable = is_variable; result.allocation_type = kTfLiteMemNone; result.data.data = const_cast(data); - result.bytes = ElementCount(*dims) * sizeof(T); - result.data.data = const_cast(data); if (type == kTfLiteInt4) { result.type = kTfLiteInt4; @@ -233,7 +240,13 @@ TfLiteTensor CreateTensor(const T* data, TfLiteIntArray* dims, // a single CreateTensor method. A Const array should be used for immutable // input tensors and non-const array should be used for mutable and output // tensors. 
- result.type = typeToTfLiteType(); + if (type == kTfLiteNoType) { + result.type = typeToTfLiteType(); + } else { + result.type = type; + } + + result.bytes = ElementCount(*dims) * TfLiteTypeGetSize(result.type); } return result; } @@ -260,37 +273,106 @@ TfLiteTensor CreateQuantizedTensor(const float* input, T* quantized, type); } +// TODO(ddavis-2015): remove TfLiteTensor CreateQuantizedBiasTensor(const float* data, int16_t* quantized, TfLiteIntArray* dims, float input_scale, float weights_scale, bool is_variable = false); +// TODO(ddavis-2015): remove TfLiteTensor CreateQuantizedBiasTensor(const float* data, int32_t* quantized, TfLiteIntArray* dims, float input_scale, float weights_scale, bool is_variable = false); +// TODO(ddavis-2015): remove TfLiteTensor CreateQuantizedBiasTensor(const float* data, std::int64_t* quantized, TfLiteIntArray* dims, float input_scale, float weights_scale, bool is_variable = false); -// Quantizes int32_t bias tensor with per-channel weights determined by input -// scale multiplied by weight scale for each channel. +// Creates bias tensor with pre-calculated compressed input data and per-channel +// weights determined by input scale multiplied by weight scale for each +// channel. +template TfLiteTensor CreatePerChannelQuantizedBiasTensor( - const float* input, int32_t* quantized, TfLiteIntArray* dims, - float input_scale, float* weight_scales, float* scales, int* zero_points, - TfLiteAffineQuantization* affine_quant, int quantized_dimension, - bool is_variable = false); + const T* input_data, TfLiteIntArray* dims, float input_scale, + const TfLiteFloatArray* weight_scales, TfLiteFloatArray* scales, + TfLiteIntArray* zero_points, TfLiteAffineQuantization* affine_quant, + int quantized_dimension, bool is_variable = false, + TfLiteType type = kTfLiteNoType) { + int num_channels = dims->data[quantized_dimension]; + zero_points->size = num_channels; + scales->size = num_channels; + for (int i = 0; i < num_channels; i++) { + scales->data[i] = input_scale * weight_scales->data[i]; + zero_points->data[i] = 0; + MicroPrintf("index %d scales %f zero_point %d input scale %f weight %f", i, + (double)scales->data[i], zero_points->data[i], + (double)input_scale, (double)weight_scales->data[i]); + } + + affine_quant->scale = scales; + affine_quant->zero_point = zero_points; + affine_quant->quantized_dimension = quantized_dimension; + + TfLiteTensor result = CreateTensor(input_data, dims, is_variable, type); + result.quantization = {kTfLiteAffineQuantization, affine_quant}; + return result; +} -// Quantizes int64_t bias tensor with per-channel weights determined by input +// Quantizes bias tensor with per-channel weights determined by input // scale multiplied by weight scale for each channel. 
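+//
+// Illustrative use (the names below are hypothetical test values, not part
+// of this header):
+//
+//   constexpr int kChannels = 2;
+//   const float weight_scales[kChannels] = {0.5f, 0.25f};
+//   float scales[kChannels + 1];     // element 0 is set to the channel count
+//   int zero_points[kChannels + 1];  // element 0 is set to the channel count
+//   TfLiteAffineQuantization affine_quant;
+//   int32_t quantized[kChannels];
+//   TfLiteTensor bias = CreatePerChannelQuantizedBiasTensor(
+//       bias_data, quantized, bias_dims, /*input_scale=*/0.1f, weight_scales,
+//       scales, zero_points, &affine_quant, /*quantized_dimension=*/0);
+//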
+template TfLiteTensor CreatePerChannelQuantizedBiasTensor( - const float* input, std::int64_t* quantized, TfLiteIntArray* dims, - float input_scale, float* weight_scales, float* scales, int* zero_points, + const float* input, T* quantized, TfLiteIntArray* dims, float input_scale, + const float* weight_scales, float* scales, int* zero_points, TfLiteAffineQuantization* affine_quant, int quantized_dimension, - bool is_variable = false); + bool is_variable = false) { + int input_size = ElementCount(*dims); + int num_channels = dims->data[quantized_dimension]; + // First element is reserved for array length + zero_points[0] = num_channels; + scales[0] = static_cast(num_channels); + float* scales_array = &scales[1]; + for (int i = 0; i < num_channels; i++) { + scales_array[i] = input_scale * weight_scales[i]; + zero_points[i + 1] = 0; + MicroPrintf("index %d scales %f zero_point %d input scale %f weight %f", i, + (double)scales_array[i], zero_points[i + 1], + (double)input_scale, (double)weight_scales[i]); + } + + SymmetricPerChannelQuantize(input, quantized, input_size, num_channels, + scales_array); + + affine_quant->scale = FloatArrayFromFloats(scales); + affine_quant->zero_point = IntArrayFromInts(zero_points); + affine_quant->quantized_dimension = quantized_dimension; + + TfLiteTensor result = CreateTensor(quantized, dims, is_variable); + result.quantization = {kTfLiteAffineQuantization, affine_quant}; + int64_t data0 = quantized[0]; + MicroPrintf("quantp %p data %f data quantized %lld", affine_quant, + (double)input[0], data0); + return result; +} + +template +TfLiteTensor CreatePerChannelQuantizedTensor( + const T* quantized, TfLiteIntArray* dims, TfLiteFloatArray* scales, + TfLiteIntArray* zero_points, TfLiteAffineQuantization* affine_quant, + int quantized_dimension, bool is_variable = false, + TfLiteType type = kTfLiteNoType) { + affine_quant->scale = scales; + affine_quant->zero_point = zero_points; + affine_quant->quantized_dimension = quantized_dimension; + + TfLiteTensor result = CreateTensor(quantized, dims, is_variable, type); + result.quantization = {kTfLiteAffineQuantization, affine_quant}; + return result; +} TfLiteTensor CreateSymmetricPerChannelQuantizedTensor( const float* input, int8_t* quantized, TfLiteIntArray* dims, float* scales, diff --git a/tensorflow/lite/micro/testing/micro_test.h b/tensorflow/lite/micro/testing/micro_test.h index a28f4b6d8e4..1e17531efea 100644 --- a/tensorflow/lite/micro/testing/micro_test.h +++ b/tensorflow/lite/micro/testing/micro_test.h @@ -1,4 +1,4 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -264,4 +264,11 @@ inline void InitializeTest() { InitializeTarget(); } } \ } while (false) +#define TF_LITE_MICRO_CHECK_FAIL() \ + do { \ + if (micro_test::did_test_fail) { \ + return kTfLiteError; \ + } \ + } while (false) + #endif // TENSORFLOW_LITE_MICRO_TESTING_MICRO_TEST_H_ diff --git a/tensorflow/lite/micro/tools/benchmarking/metrics.cc b/tensorflow/lite/micro/tools/benchmarking/metrics.cc index 3a4bf7e4917..f71a4cd139e 100644 --- a/tensorflow/lite/micro/tools/benchmarking/metrics.cc +++ b/tensorflow/lite/micro/tools/benchmarking/metrics.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -46,7 +46,8 @@ struct LogAllocationRecord { constexpr int kArenaRows = 3; constexpr int kArenaColumns = 3; -constexpr int kAllocationTypes = 7; +constexpr int kAllocationTypes = + static_cast(tflite::RecordedAllocationType::kNumAllocationTypes); constexpr int kAllocationColumns = 6; constexpr int kMaxBufSize = 100; @@ -85,16 +86,25 @@ LogAllocationRecord GetLogAllocationRecord( tflite::RecordedAllocationType::kPersistentBufferData, tflite::RecordedAllocationType::kTfLiteTensorVariableBufferData, tflite::RecordedAllocationType::kNodeAndRegistrationArray, - tflite::RecordedAllocationType::kOpData}; + tflite::RecordedAllocationType::kOpData, +#ifdef USE_TFLM_COMPRESSION + tflite::RecordedAllocationType::kCompressionData, +#endif // USE_TFLM_COMPRESSION + }; static_assert(std::extent::value == kAllocationTypes, "kAllocationTypes mismatch"); - const char* titles[] = {"Eval tensor data", - "Persistent tensor data", - "Persistent quantization data", - "Persistent buffer data", - "Tensor variable buffer data", - "Node and registration array", - "Operation data"}; + const char* titles[] = { + "Eval tensor data", + "Persistent tensor data", + "Persistent quantization data", + "Persistent buffer data", + "Tensor variable buffer data", + "Node and registration array", + "Operation data", +#ifdef USE_TFLM_COMPRESSION + "Compression data", +#endif // USE_TFLM_COMPRESSION + }; static_assert(std::extent::value == kAllocationTypes, "kAllocationTypes mismatch"); const size_t total_bytes = diff --git a/tensorflow/lite/micro/tools/ci_build/test_x86_default.sh b/tensorflow/lite/micro/tools/ci_build/test_x86_default.sh index 998827f24de..f5392dddeec 100755 --- a/tensorflow/lite/micro/tools/ci_build/test_x86_default.sh +++ b/tensorflow/lite/micro/tools/ci_build/test_x86_default.sh @@ -41,6 +41,12 @@ readable_run make -s -j8 -f ${TENSORFLOW_ROOT}tensorflow/lite/micro/tools/make/M readable_run make -s -j8 -f ${TENSORFLOW_ROOT}tensorflow/lite/micro/tools/make/Makefile test TENSORFLOW_ROOT=${TENSORFLOW_ROOT} EXTERNAL_DIR=${EXTERNAL_DIR} readable_run make -s -j8 -f ${TENSORFLOW_ROOT}tensorflow/lite/micro/tools/make/Makefile integration_tests TENSORFLOW_ROOT=${TENSORFLOW_ROOT} EXTERNAL_DIR=${EXTERNAL_DIR} +# optional TFLM tensor compression - execute the unit tests +readable_run make -s -j8 -f ${TENSORFLOW_ROOT}tensorflow/lite/micro/tools/make/Makefile test \ + TENSORFLOW_ROOT=${TENSORFLOW_ROOT} \ + EXTERNAL_DIR=${EXTERNAL_DIR} \ + USE_TFLM_COMPRESSION=yes + # run generic benchmark readable_run make -j$(nproc) -f ${TENSORFLOW_ROOT}tensorflow/lite/micro/tools/make/Makefile \ TENSORFLOW_ROOT=${TENSORFLOW_ROOT} \ diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index 90b0c2945ff..45a7af96382 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -270,6 +270,17 @@ endif # runtime that can be linked in to other programs. MICROLITE_LIB_NAME := libtensorflow-microlite.a +# TFLM optional compression support (default disabled) +ENABLE_COMPRESSION := no +ifneq ($(USE_TFLM_COMPRESSION),) + # currently only Linux targets supported + ifeq ($(TARGET), $(filter $(TARGET), linux)) + CXXFLAGS += -DUSE_TFLM_COMPRESSION + CCFLAGS += -DUSE_TFLM_COMPRESSION + ENABLE_COMPRESSION := yes + endif +endif + # Where compiled objects are stored. 
BASE_GENDIR := gen GENDIR := $(BASE_GENDIR)/$(TARGET)_$(TARGET_ARCH)_$(BUILD_TYPE) @@ -279,6 +290,9 @@ endif ifneq ($(CO_PROCESSOR),) GENDIR := $(GENDIR)_$(CO_PROCESSOR) endif +ifeq ($(ENABLE_COMPRESSION), yes) + GENDIR := $(GENDIR)_compression +endif GENDIR := $(GENDIR)_$(TOOLCHAIN)/ CORE_OBJDIR := $(GENDIR)obj/core/
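# Example usage (assumes a Linux host, matching the TARGET filter above):
# build and run the unit tests with tensor compression enabled; output is
# kept in a separate "..._compression" GENDIR so it does not collide with a
# default build.
#
#   make -f tensorflow/lite/micro/tools/make/Makefile test USE_TFLM_COMPRESSION=yes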