From 3364d9186de768375b7b532ea3c314197c7317e3 Mon Sep 17 00:00:00 2001
From: Leo <leo@powdrlabs.com>
Date: Wed, 9 Oct 2024 19:10:45 +0200
Subject: [PATCH] Memory16 (#1871)

Extracted from https://github.com/powdr-labs/powdr/pull/1790

This PR adds std and witgen support for the new 16-bit limb memory
machine.

- The current version only supports 24-bit addresses. I think it's fine
for now, but we should fix it later.
- Github fails miserably at showing the proper diff, but:
- the new file `double_sorted_32.rs` is the same as the old unique
`double_sorted.rs`, minus the common parts which
- were left in the outermost `double_sorted.rs` which dispatches calls
depending on the field
- the new file `double_sorted_16.rs` is very similar to the 32 case, but
adjusted for 2 value fields.
- There are probably more common things we can extract, but this here is
enough for a first version.

---------

Co-authored-by: Leo Alt <leo@ethereum.org>
Co-authored-by: Georg Wiese <georgwiese@gmail.com>
---
 .../double_sorted_witness_machine_16.rs       | 559 ++++++++++++++++++
 ...rs => double_sorted_witness_machine_32.rs} |  10 +-
 .../src/witgen/machines/machine_extractor.rs  |  18 +-
 executor/src/witgen/machines/mod.rs           |  23 +-
 pipeline/tests/asm.rs                         |  12 +-
 pipeline/tests/powdr_std.rs                   |   7 +
 std/machines/memory16.asm                     | 116 ++++
 std/machines/mod.asm                          |   1 +
 std/machines/range.asm                        |  12 +
 test_data/std/memory16_test.asm               |  73 +++
 10 files changed, 808 insertions(+), 23 deletions(-)
 create mode 100644 executor/src/witgen/machines/double_sorted_witness_machine_16.rs
 rename executor/src/witgen/machines/{double_sorted_witness_machine.rs => double_sorted_witness_machine_32.rs} (98%)
 create mode 100644 std/machines/memory16.asm
 create mode 100644 test_data/std/memory16_test.asm
diff --git a/executor/src/witgen/machines/double_sorted_witness_machine_16.rs b/executor/src/witgen/machines/double_sorted_witness_machine_16.rs
new file mode 100644
index 0000000000..be6a8354d0
--- /dev/null
+++ b/executor/src/witgen/machines/double_sorted_witness_machine_16.rs
@@ -0,0 +1,559 @@
+use std::collections::{BTreeMap, HashMap, HashSet};
+use std::iter::once;
+
+use itertools::Itertools;
+
+use super::{Machine, MachineParts};
+use crate::witgen::rows::RowPair;
+use crate::witgen::util::try_to_simple_poly;
+use crate::witgen::{EvalError, EvalResult, FixedData, MutableState, QueryCallback};
+use crate::witgen::{EvalValue, IncompleteCause};
+use powdr_number::{DegreeType, FieldElement, LargeInt};
+
+use powdr_ast::analyzed::{DegreeRange, IdentityKind, PolyID};
+
+/// If all witnesses of a machine have a name in this list (disregarding the namespace),
+/// we'll consider it to be a double-sorted machine.
+/// This does not include the selectors, which are dynamically added to this list.
+const ALLOWED_WITNESSES: [&str; 11] = [
+    "m_value1",
+    "m_value2",
+    "m_addr_high",
+    "m_addr_low",
+    "m_step_high",
+    "m_step_low",
+    "m_change",
+    "m_is_write",
+    "m_is_bootloader_write",
+    "m_tmp1",
+    "m_tmp2",
+];
+
+const DIFF_COLUMNS: [&str; 2] = ["m_tmp1", "m_tmp2"];
+const BOOTLOADER_WRITE_COLUMN: &str = "m_is_bootloader_write";
+
+// The operation ID is decomposed into m_is_write + 2 * m_is_bootloader_write
+const OPERATION_ID_WRITE: u64 = 1;
+const OPERATION_ID_BOOTLOADER_WRITE: u64 = 2;
+
+/// A 32-bit word, represented as two 16-bit limbs (big endian).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+struct Word32<T>(T, T);
+
+impl<T: FieldElement> From<Word32<T>> for u64 {
+    fn from(value: Word32<T>) -> Self {
+        [value.0, value.1]
+            .iter()
+            .map(|a| {
+                let value = a.to_integer().try_into_u64().unwrap();
+                // We expect 16-Bit limbs
+                TryInto::<u16>::try_into(value).expect("Expected 16-Bit limbs")
+            })
+            .fold(0u64, |acc, x| (acc << 16) | (x as u64))
+    }
+}
+
+fn split_column_name(name: &str) -> (&str, &str) {
+    let mut limbs = name.split("::");
+    let namespace = limbs.next().unwrap();
+    let col = limbs.next().unwrap();
+    (namespace, col)
+}
+
+/// TODO make this generic
+
+pub struct DoubleSortedWitnesses16<'a, T: FieldElement> {
+    degree_range: DegreeRange,
+    degree: DegreeType,
+    //key_col: String,
+    /// Position of the witness columns in the data.
+    /// The key column has a position of usize::max
+    //witness_positions: HashMap<String, usize>,
+    /// (addr, step) -> (value1, value2)
+    trace: BTreeMap<(Word32<T>, T), Operation<T>>,
+    data: BTreeMap<u64, u64>,
+    is_initialized: BTreeMap<Word32<T>, bool>,
+    namespace: String,
+    name: String,
+    parts: MachineParts<'a, T>,
+    /// If the machine has the `m_diff_upper` and `m_diff_lower` columns, this is the base of the
+    /// two digits.
+    diff_columns_base: Option<u64>,
+    /// Whether this machine has a `m_is_bootloader_write` column.
+    has_bootloader_write_column: bool,
+    /// All selector IDs that are used on the right-hand side connecting identities.
+    selector_ids: BTreeMap<u64, PolyID>,
+}
+
+struct Operation<T> {
+    pub is_normal_write: bool,
+    pub is_bootloader_write: bool,
+    pub value: Word32<T>,
+    pub selector_id: PolyID,
+}
+
+impl<'a, T: FieldElement> DoubleSortedWitnesses16<'a, T> {
+    fn namespaced(&self, name: &str) -> String {
+        format!("{}::{}", self.namespace, name)
+    }
+
+    pub fn try_new(
+        name: String,
+        fixed_data: &'a FixedData<'a, T>,
+        parts: &MachineParts<'a, T>,
+    ) -> Option<Self> {
+        let degree_range = parts.common_degree_range();
+
+        let degree = degree_range.max;
+
+        // get the namespaces and column names
+        let (mut namespaces, columns): (HashSet<_>, HashSet<_>) = parts
+            .witnesses
+            .iter()
+            .map(|r| split_column_name(parts.column_name(r)))
+            .unzip();
+
+        if namespaces.len() > 1 {
+            // columns are not in the same namespace, fail
+            return None;
+        }
+
+        if !parts
+            .connecting_identities
+            .values()
+            .all(|i| i.kind == IdentityKind::Permutation)
+        {
+            return None;
+        }
+
+        let selector_ids = parts
+            .connecting_identities
+            .values()
+            .map(|i| {
+                i.right
+                    .selector
+                    .as_ref()
+                    .and_then(|r| try_to_simple_poly(r))
+                    .map(|p| (i.id, p.poly_id))
+            })
+            .collect::<Option<BTreeMap<_, _>>>()?;
+
+        let namespace = namespaces.drain().next().unwrap().into();
+
+        // TODO check the identities.
+        let selector_names = selector_ids
+            .values()
+            .map(|s| split_column_name(parts.column_name(s)).1);
+        let allowed_witnesses: HashSet<_> = ALLOWED_WITNESSES
+            .into_iter()
+            .chain(selector_names)
+            .collect();
+        if !columns.iter().all(|c| allowed_witnesses.contains(c)) {
+            return None;
+        }
+
+        let has_diff_columns = DIFF_COLUMNS.iter().all(|c| columns.contains(c));
+        let has_bootloader_write_column = columns.contains(&BOOTLOADER_WRITE_COLUMN);
+
+        let diff_columns_base = if has_diff_columns {
+            // We have the `m_tmp1` and `m_tmp2` columns.
+            // These have different semantics, depending on whether the the address changes in the next
+            // row. But in both cases, the range constraint of m_tmp2 will give us the base.
+            let lower_poly_id =
+                fixed_data.try_column_by_name(&format!("{namespace}::{}", DIFF_COLUMNS[1]))?;
+            let lower_range_constraint = fixed_data.global_range_constraints().witness_constraints
+                [&lower_poly_id]
+                .as_ref()?;
+
+            let (min, max) = lower_range_constraint.range();
+
+            if min == T::zero() {
+                Some(max.to_degree() + 1)
+            } else {
+                return None;
+            }
+        } else {
+            None
+        };
+
+        if !parts.prover_functions.is_empty() {
+            log::warn!(
+                "DoubleSortedWitness16 machine does not support prover functions.\
+                The following prover functions are ignored:\n{}",
+                parts.prover_functions.iter().format("\n")
+            );
+        }
+
+        Some(Self {
+            name,
+            degree_range,
+            namespace,
+            parts: parts.clone(), // TODO is this really unused?
+            degree,
+            diff_columns_base,
+            has_bootloader_write_column,
+            trace: Default::default(),
+            data: Default::default(),
+            is_initialized: Default::default(),
+            selector_ids,
+        })
+    }
+}
+
+impl<'a, T: FieldElement> Machine<'a, T> for DoubleSortedWitnesses16<'a, T> {
+    fn identity_ids(&self) -> Vec<u64> {
+        self.selector_ids.keys().cloned().collect()
+    }
+
+    fn name(&self) -> &str {
+        &self.name
+    }
+
+    fn process_plookup<Q: QueryCallback<T>>(
+        &mut self,
+        _mutable_state: &mut MutableState<'a, '_, T, Q>,
+        identity_id: u64,
+        caller_rows: &RowPair<'_, 'a, T>,
+    ) -> EvalResult<'a, T> {
+        self.process_plookup_internal(identity_id, caller_rows)
+    }
+
+    fn take_witness_col_values<'b, Q: QueryCallback<T>>(
+        &mut self,
+        _mutable_state: &'b mut MutableState<'a, 'b, T, Q>,
+    ) -> HashMap<String, Vec<T>> {
+        let mut addr: Vec<Word32<T>> = vec![];
+        let mut step = vec![];
+        let mut value: Vec<Word32<T>> = vec![];
+        let mut is_normal_write = vec![];
+        let mut is_bootloader_write = vec![];
+        let mut selectors = self
+            .selector_ids
+            .values()
+            .map(|id| (id, Vec::new()))
+            .collect::<BTreeMap<_, _>>();
+        let mut set_selector = |selector_id: Option<PolyID>| {
+            for (id, v) in selectors.iter_mut() {
+                v.push(if Some(**id) == selector_id {
+                    T::one()
+                } else {
+                    T::zero()
+                })
+            }
+        };
+
+        for ((current_address, current_step), operation) in std::mem::take(&mut self.trace) {
+            let address_int: u64 = current_address.into();
+            if let Some(prev_address) = addr.last() {
+                let prev_address_int: u64 = (*prev_address).into();
+                assert!(
+                    current_address >= *prev_address,
+                    "Expected addresses to be sorted"
+                );
+                if self.diff_columns_base.is_none() && address_int - prev_address_int >= self.degree
+                {
+                    log::error!("Jump in memory accesses between {prev_address_int:x} and {address_int:x} is larger than or equal to the degree {}! This will violate the constraints.", self.degree);
+                }
+
+                let current_diff = if current_address != *prev_address {
+                    address_int - prev_address_int
+                } else {
+                    (current_step - *step.last().unwrap()).to_degree()
+                };
+                assert!(current_diff > 0);
+            }
+
+            addr.push(current_address);
+            step.push(current_step);
+            value.push(operation.value);
+
+            is_normal_write.push(operation.is_normal_write.into());
+            is_bootloader_write.push(operation.is_bootloader_write.into());
+            set_selector(Some(operation.selector_id));
+        }
+        if addr.is_empty() {
+            // No memory access at all - fill a first row with something.
+            addr.push(Word32(-T::one(), -T::one()));
+            step.push(0.into());
+            value.push(Word32(0.into(), 0.into()));
+            is_normal_write.push(0.into());
+            is_bootloader_write.push(0.into());
+            set_selector(None);
+        }
+
+        let current_size = addr.len();
+        let new_size = current_size.next_power_of_two() as DegreeType;
+        let new_size = self.degree_range.fit(new_size);
+        log::info!(
+            "Resizing variable length machine '{}': {} -> {} (rounded up from {})",
+            self.name,
+            self.degree,
+            new_size,
+            current_size
+        );
+        self.degree = new_size;
+
+        while addr.len() < self.degree as usize {
+            addr.push(*addr.last().unwrap());
+            step.push(*step.last().unwrap() + T::from(1));
+            value.push(*value.last().unwrap());
+            is_normal_write.push(0.into());
+            is_bootloader_write.push(0.into());
+            set_selector(None);
+        }
+
+        let last_row_change_value = match self.has_bootloader_write_column {
+            true => (&addr[0] != addr.last().unwrap()).into(),
+            // In the machine without the bootloader write column, m_change is constrained
+            // to be 1 in the last row.
+            false => 1.into(),
+        };
+
+        let change = addr
+            .iter()
+            .tuple_windows()
+            .map(|(a, a_next)| if a == a_next { 0.into() } else { 1.into() })
+            .chain(once(last_row_change_value))
+            .collect::<Vec<_>>();
+        assert_eq!(change.len(), addr.len());
+
+        let diff_columns = if let Some(diff_columns_base) = self.diff_columns_base {
+            let (diff_col1, diff_col2): (Vec<T>, Vec<T>) = change
+                .iter()
+                .enumerate()
+                .map(|(i, address_change)| {
+                    if address_change == &T::zero() {
+                        // We are comparing the time step. The diff columns should contain the
+                        // high and low limb of the difference - 1.
+                        // Note that i + 1 will never be out of bounds because m_change is constrained
+                        // to be 1 in the last row.
+                        let diff = (step[i + 1] - step[i]).to_degree();
+                        assert!(diff > 0);
+                        (
+                            T::from((diff - 1) / diff_columns_base),
+                            T::from((diff - 1) % diff_columns_base),
+                        )
+                    } else {
+                        // We are comparing the address. The first value should store whether the high
+                        // 16-Bit limbs are equal; the second value should store the diff - 1 of the
+                        // limb being compared
+
+                        // Get the current and next address. The next address is None for the last row.
+                        let current_addr = &addr[i];
+                        let next_addr = addr.get(i + 1);
+
+                        next_addr
+                            .map(|next_addr| {
+                                if current_addr.0 == next_addr.0 {
+                                    assert!(current_addr.1 < next_addr.1);
+                                    (T::zero(), next_addr.1 - current_addr.1 - T::one())
+                                } else {
+                                    assert!(current_addr.0 < next_addr.0);
+                                    (T::one(), next_addr.0 - current_addr.0 - T::one())
+                                }
+                            })
+                            // On the last row, the diff value is unconstrained, choose (0, 0).
+                            .unwrap_or((T::zero(), T::zero()))
+                    }
+                })
+                .unzip();
+            vec![
+                (self.namespaced(DIFF_COLUMNS[0]), diff_col1),
+                (self.namespaced(DIFF_COLUMNS[1]), diff_col2),
+            ]
+        } else {
+            vec![]
+        };
+
+        let bootloader_columns = if self.has_bootloader_write_column {
+            vec![(
+                self.namespaced(BOOTLOADER_WRITE_COLUMN),
+                is_bootloader_write.clone(),
+            )]
+        } else {
+            vec![]
+        };
+
+        let (addr_high, addr_low) = addr
+            .into_iter()
+            .map(|a| (a.0, a.1))
+            .unzip::<_, _, Vec<_>, Vec<_>>();
+
+        let (value_high, value_low) = value
+            .into_iter()
+            .map(|v| (v.0, v.1))
+            .unzip::<_, _, Vec<_>, Vec<_>>();
+
+        let (step_high, step_low) = step
+            .into_iter()
+            .map(|s| {
+                let s = s.to_degree();
+                (T::from(s >> 16), T::from(s & 0xffff))
+            })
+            .unzip::<_, _, Vec<_>, Vec<_>>();
+
+        let selector_columns = selectors
+            .into_iter()
+            .map(|(id, v)| (self.parts.column_name(id).to_string(), v))
+            .collect::<Vec<_>>();
+
+        [
+            (self.namespaced("m_value1"), value_high),
+            (self.namespaced("m_value2"), value_low),
+            (self.namespaced("m_addr_high"), addr_high),
+            (self.namespaced("m_addr_low"), addr_low),
+            (self.namespaced("m_step_high"), step_high),
+            (self.namespaced("m_step_low"), step_low),
+            (self.namespaced("m_change"), change),
+            (self.namespaced("m_is_write"), is_normal_write.clone()),
+        ]
+        .into_iter()
+        .chain(diff_columns)
+        .chain(bootloader_columns)
+        .chain(selector_columns)
+        .collect()
+    }
+}
+
+impl<'a, T: FieldElement> DoubleSortedWitnesses16<'a, T> {
+    pub fn process_plookup_internal(
+        &mut self,
+        identity_id: u64,
+        caller_rows: &RowPair<'_, 'a, T>,
+    ) -> EvalResult<'a, T> {
+        // We blindly assume the lookup is of the form
+        // OP { operation_id, ADDR_high, ADDR_low, STEP, X_high, X_low } is
+        // <selector> { operation_id, m_addr_high, m_addr_low, m_step, m_value_high, m_value_low }
+        // Where:
+        // - operation_id == 0: Read
+        // - operation_id == 1: Write
+        // - operation_id == 2: Bootloader write
+
+        let args = self.parts.connecting_identities[&identity_id]
+            .left
+            .expressions
+            .iter()
+            .map(|e| caller_rows.evaluate(e).unwrap())
+            .collect::<Vec<_>>();
+
+        let operation_id = match args[0].constant_value() {
+            Some(v) => v,
+            None => {
+                return Ok(EvalValue::incomplete(
+                    IncompleteCause::NonConstantRequiredArgument("operation_id"),
+                ))
+            }
+        };
+
+        let selector_id = *self.selector_ids.get(&identity_id).unwrap();
+
+        let is_normal_write = operation_id == T::from(OPERATION_ID_WRITE);
+        let is_bootloader_write = operation_id == T::from(OPERATION_ID_BOOTLOADER_WRITE);
+        let is_write = is_bootloader_write || is_normal_write;
+        let addr = match (args[1].constant_value(), args[2].constant_value()) {
+            (Some(high), Some(low)) => Word32(high, low),
+            _ => {
+                return Ok(EvalValue::incomplete(
+                    IncompleteCause::NonConstantRequiredArgument("m_addr"),
+                ))
+            }
+        };
+
+        let addr_int: u64 = addr.into();
+
+        if self.has_bootloader_write_column {
+            let is_initialized = self.is_initialized.get(&addr).cloned().unwrap_or_default();
+            if !is_initialized && !is_bootloader_write {
+                panic!("Memory address {addr_int:x} must be initialized with a bootloader write",);
+            }
+            self.is_initialized.insert(addr, true);
+        }
+
+        let step = args[3]
+            .constant_value()
+            .ok_or_else(|| format!("Step must be known but is: {}", args[3]))?;
+
+        let value1_expr = &args[4];
+        let value2_expr = &args[5];
+
+        log::trace!(
+            "Query addr=0x{:x}, step={step}, write: {is_write}, value: ({} {})",
+            addr_int,
+            value1_expr,
+            value2_expr
+        );
+
+        // TODO this does not check any of the failure modes
+        let mut assignments = EvalValue::complete(vec![]);
+        let has_side_effect = if is_write {
+            let value = match (value1_expr.constant_value(), value2_expr.constant_value()) {
+                (Some(high), Some(low)) => Word32(high, low),
+                _ => {
+                    return Ok(EvalValue::incomplete(
+                        IncompleteCause::NonConstantRequiredArgument("m_value"),
+                    ))
+                }
+            };
+
+            let value_int: u64 = value.into();
+
+            log::trace!(
+                "Memory write: addr=0x{:x}, step={step}, value=0x{:x}",
+                addr_int,
+                value_int
+            );
+            self.data.insert(addr_int, value_int);
+            self.trace
+                .insert(
+                    (addr, step),
+                    Operation {
+                        is_normal_write,
+                        is_bootloader_write,
+                        value,
+                        selector_id,
+                    },
+                )
+                .is_none()
+        } else {
+            let value = self.data.entry(addr_int).or_default();
+            log::trace!(
+                "Memory read: addr=0x{:x}, step={step}, value=0x{:x}",
+                addr_int,
+                value
+            );
+
+            let value_int: u64 = *value;
+            let value_low = value_int & 0xffff;
+            let value_high = value_int >> 16;
+            let value_low_fe: T = value_low.into();
+            let value_high_fe: T = value_high.into();
+
+            let ass = (value1_expr.clone() - value_high_fe.into())
+                .solve_with_range_constraints(caller_rows)?;
+            assignments.combine(ass);
+            let ass2 = (value2_expr.clone() - value_low_fe.into())
+                .solve_with_range_constraints(caller_rows)?;
+            assignments.combine(ass2);
+            self.trace
+                .insert(
+                    (addr, step),
+                    Operation {
+                        is_normal_write,
+                        is_bootloader_write,
+                        value: Word32(value_high_fe, value_low_fe),
+                        selector_id,
+                    },
+                )
+                .is_none()
+        };
+        if has_side_effect {
+            assignments = assignments.report_side_effect();
+        }
+
+        if self.trace.len() >= (self.degree as usize) {
+            return Err(EvalError::RowsExhausted(self.name.clone()));
+        }
+
+        Ok(assignments)
+    }
+}
diff --git a/executor/src/witgen/machines/double_sorted_witness_machine.rs b/executor/src/witgen/machines/double_sorted_witness_machine_32.rs
similarity index 98%
rename from executor/src/witgen/machines/double_sorted_witness_machine.rs
rename to executor/src/witgen/machines/double_sorted_witness_machine_32.rs
index 0f49108987..2cd2c8e02c 100644
--- a/executor/src/witgen/machines/double_sorted_witness_machine.rs
+++ b/executor/src/witgen/machines/double_sorted_witness_machine_32.rs
@@ -43,7 +43,7 @@ fn split_column_name(name: &str) -> (&str, &str) {
 
 /// TODO make this generic
 
-pub struct DoubleSortedWitnesses<'a, T: FieldElement> {
+pub struct DoubleSortedWitnesses32<'a, T: FieldElement> {
     degree_range: DegreeRange,
     degree: DegreeType,
     //key_col: String,
@@ -74,7 +74,7 @@ struct Operation<T> {
     pub selector_id: PolyID,
 }
 
-impl<'a, T: FieldElement> DoubleSortedWitnesses<'a, T> {
+impl<'a, T: FieldElement> DoubleSortedWitnesses32<'a, T> {
     fn namespaced(&self, name: &str) -> String {
         format!("{}::{}", self.namespace, name)
     }
@@ -165,7 +165,7 @@ impl<'a, T: FieldElement> DoubleSortedWitnesses<'a, T> {
 
         if !parts.prover_functions.is_empty() {
             log::warn!(
-                "DoubleSortedWitness machine does not support prover functions.\
+                "DoubleSortedWitness32 machine does not support prover functions.\
                 The following prover functions are ignored:\n{}",
                 parts.prover_functions.iter().format("\n")
             );
@@ -187,7 +187,7 @@ impl<'a, T: FieldElement> DoubleSortedWitnesses<'a, T> {
     }
 }
 
-impl<'a, T: FieldElement> Machine<'a, T> for DoubleSortedWitnesses<'a, T> {
+impl<'a, T: FieldElement> Machine<'a, T> for DoubleSortedWitnesses32<'a, T> {
     fn identity_ids(&self) -> Vec<u64> {
         self.selector_ids.keys().cloned().collect()
     }
@@ -353,7 +353,7 @@ impl<'a, T: FieldElement> Machine<'a, T> for DoubleSortedWitnesses<'a, T> {
     }
 }
 
-impl<'a, T: FieldElement> DoubleSortedWitnesses<'a, T> {
+impl<'a, T: FieldElement> DoubleSortedWitnesses32<'a, T> {
     fn process_plookup_internal(
         &mut self,
         identity_id: u64,
diff --git a/executor/src/witgen/machines/machine_extractor.rs b/executor/src/witgen/machines/machine_extractor.rs
index 0013aec42c..142f501eac 100644
--- a/executor/src/witgen/machines/machine_extractor.rs
+++ b/executor/src/witgen/machines/machine_extractor.rs
@@ -3,7 +3,8 @@ use std::collections::{BTreeMap, HashSet};
 use itertools::Itertools;
 
 use super::block_machine::BlockMachine;
-use super::double_sorted_witness_machine::DoubleSortedWitnesses;
+use super::double_sorted_witness_machine_16::DoubleSortedWitnesses16;
+use super::double_sorted_witness_machine_32::DoubleSortedWitnesses32;
 use super::fixed_lookup_machine::FixedLookup;
 use super::sorted_witness_machine::SortedWitnesses;
 use super::FixedData;
@@ -217,13 +218,20 @@ fn build_machine<'a, T: FieldElement>(
     {
         log::debug!("Detected machine: sorted witnesses / write-once memory");
         KnownMachine::SortedWitnesses(machine)
-    } else if let Some(machine) = DoubleSortedWitnesses::try_new(
-        name_with_type("DoubleSortedWitnesses"),
+    } else if let Some(machine) = DoubleSortedWitnesses16::try_new(
+        name_with_type("DoubleSortedWitnesses16"),
         fixed_data,
         &machine_parts,
     ) {
-        log::debug!("Detected machine: memory");
-        KnownMachine::DoubleSortedWitnesses(machine)
+        log::debug!("Detected machine: memory16");
+        KnownMachine::DoubleSortedWitnesses16(machine)
+    } else if let Some(machine) = DoubleSortedWitnesses32::try_new(
+        name_with_type("DoubleSortedWitnesses32"),
+        fixed_data,
+        &machine_parts,
+    ) {
+        log::debug!("Detected machine: memory32");
+        KnownMachine::DoubleSortedWitnesses32(machine)
     } else if let Some(machine) = WriteOnceMemory::try_new(
         name_with_type("WriteOnceMemory"),
         fixed_data,
diff --git a/executor/src/witgen/machines/mod.rs b/executor/src/witgen/machines/mod.rs
index a00ed04dfe..63655c71d7 100644
--- a/executor/src/witgen/machines/mod.rs
+++ b/executor/src/witgen/machines/mod.rs
@@ -9,7 +9,8 @@ use powdr_number::FieldElement;
 use crate::Identity;
 
 use self::block_machine::BlockMachine;
-use self::double_sorted_witness_machine::DoubleSortedWitnesses;
+use self::double_sorted_witness_machine_16::DoubleSortedWitnesses16;
+use self::double_sorted_witness_machine_32::DoubleSortedWitnesses32;
 pub use self::fixed_lookup_machine::FixedLookup;
 use self::profiling::{record_end, record_start};
 use self::sorted_witness_machine::SortedWitnesses;
@@ -20,7 +21,8 @@ use super::rows::RowPair;
 use super::{EvalResult, FixedData, MutableState, QueryCallback};
 
 mod block_machine;
-mod double_sorted_witness_machine;
+mod double_sorted_witness_machine_16;
+mod double_sorted_witness_machine_32;
 mod fixed_lookup_machine;
 pub mod machine_extractor;
 pub mod profiling;
@@ -71,7 +73,8 @@ pub trait Machine<'a, T: FieldElement>: Send + Sync {
 /// which requires that all lifetime parameters are 'static.
 pub enum KnownMachine<'a, T: FieldElement> {
     SortedWitnesses(SortedWitnesses<'a, T>),
-    DoubleSortedWitnesses(DoubleSortedWitnesses<'a, T>),
+    DoubleSortedWitnesses16(DoubleSortedWitnesses16<'a, T>),
+    DoubleSortedWitnesses32(DoubleSortedWitnesses32<'a, T>),
     WriteOnceMemory(WriteOnceMemory<'a, T>),
     BlockMachine(BlockMachine<'a, T>),
     Vm(Generator<'a, T>),
@@ -89,7 +92,10 @@ impl<'a, T: FieldElement> Machine<'a, T> for KnownMachine<'a, T> {
             KnownMachine::SortedWitnesses(m) => {
                 m.process_plookup(mutable_state, identity_id, caller_rows)
             }
-            KnownMachine::DoubleSortedWitnesses(m) => {
+            KnownMachine::DoubleSortedWitnesses16(m) => {
+                m.process_plookup(mutable_state, identity_id, caller_rows)
+            }
+            KnownMachine::DoubleSortedWitnesses32(m) => {
                 m.process_plookup(mutable_state, identity_id, caller_rows)
             }
             KnownMachine::WriteOnceMemory(m) => {
@@ -108,7 +114,8 @@ impl<'a, T: FieldElement> Machine<'a, T> for KnownMachine<'a, T> {
     fn name(&self) -> &str {
         match self {
             KnownMachine::SortedWitnesses(m) => m.name(),
-            KnownMachine::DoubleSortedWitnesses(m) => m.name(),
+            KnownMachine::DoubleSortedWitnesses16(m) => m.name(),
+            KnownMachine::DoubleSortedWitnesses32(m) => m.name(),
             KnownMachine::WriteOnceMemory(m) => m.name(),
             KnownMachine::BlockMachine(m) => m.name(),
             KnownMachine::Vm(m) => m.name(),
@@ -122,7 +129,8 @@ impl<'a, T: FieldElement> Machine<'a, T> for KnownMachine<'a, T> {
     ) -> HashMap<String, Vec<T>> {
         match self {
             KnownMachine::SortedWitnesses(m) => m.take_witness_col_values(mutable_state),
-            KnownMachine::DoubleSortedWitnesses(m) => m.take_witness_col_values(mutable_state),
+            KnownMachine::DoubleSortedWitnesses16(m) => m.take_witness_col_values(mutable_state),
+            KnownMachine::DoubleSortedWitnesses32(m) => m.take_witness_col_values(mutable_state),
             KnownMachine::WriteOnceMemory(m) => m.take_witness_col_values(mutable_state),
             KnownMachine::BlockMachine(m) => m.take_witness_col_values(mutable_state),
             KnownMachine::Vm(m) => m.take_witness_col_values(mutable_state),
@@ -133,7 +141,8 @@ impl<'a, T: FieldElement> Machine<'a, T> for KnownMachine<'a, T> {
     fn identity_ids(&self) -> Vec<u64> {
         match self {
             KnownMachine::SortedWitnesses(m) => m.identity_ids(),
-            KnownMachine::DoubleSortedWitnesses(m) => m.identity_ids(),
+            KnownMachine::DoubleSortedWitnesses16(m) => m.identity_ids(),
+            KnownMachine::DoubleSortedWitnesses32(m) => m.identity_ids(),
             KnownMachine::WriteOnceMemory(m) => m.identity_ids(),
             KnownMachine::BlockMachine(m) => m.identity_ids(),
             KnownMachine::Vm(m) => m.identity_ids(),
diff --git a/pipeline/tests/asm.rs b/pipeline/tests/asm.rs
index 82f43768c3..80ffe87c78 100644
--- a/pipeline/tests/asm.rs
+++ b/pipeline/tests/asm.rs
@@ -247,25 +247,25 @@ fn vm_to_block_multiple_links() {
 #[test]
 fn mem_read_write() {
     let f = "asm/mem_read_write.asm";
-    regular_test(f, Default::default());
+    regular_test_without_babybear(f, Default::default());
 }
 
 #[test]
 fn mem_read_write_no_memory_accesses() {
     let f = "asm/mem_read_write_no_memory_accesses.asm";
-    regular_test(f, Default::default());
+    regular_test_without_babybear(f, Default::default());
 }
 
 #[test]
 fn mem_read_write_with_bootloader() {
     let f = "asm/mem_read_write_with_bootloader.asm";
-    regular_test(f, Default::default());
+    regular_test_without_babybear(f, Default::default());
 }
 
 #[test]
 fn mem_read_write_large_diffs() {
     let f = "asm/mem_read_write_large_diffs.asm";
-    regular_test(f, Default::default());
+    regular_test_without_babybear(f, Default::default());
 }
 
 #[test]
@@ -440,7 +440,7 @@ fn vm_args() {
 #[test]
 fn vm_args_memory() {
     let f = "asm/vm_args_memory.asm";
-    regular_test(f, Default::default());
+    regular_test_without_babybear(f, Default::default());
 }
 
 #[test]
@@ -452,7 +452,7 @@ fn vm_args_relative_path() {
 #[test]
 fn vm_args_two_levels() {
     let f = "asm/vm_args_two_levels.asm";
-    regular_test(f, Default::default());
+    regular_test_without_babybear(f, Default::default());
 }
 
 mod reparse {
diff --git a/pipeline/tests/powdr_std.rs b/pipeline/tests/powdr_std.rs
index 5575f22502..c567cc72af 100644
--- a/pipeline/tests/powdr_std.rs
+++ b/pipeline/tests/powdr_std.rs
@@ -119,6 +119,13 @@ fn memory_test_parallel_accesses() {
     regular_test(f, &[]);
 }
 
+#[test]
+#[ignore = "Too slow"]
+fn memory16_test() {
+    let f = "std/memory16_test.asm";
+    test_plonky3_with_backend_variant::<BabyBearField>(f, vec![], BackendVariant::Composite);
+}
+
 #[test]
 fn permutation_via_challenges_bn() {
     let f = "std/permutation_via_challenges.asm";
diff --git a/std/machines/memory16.asm b/std/machines/memory16.asm
new file mode 100644
index 0000000000..6b725c3035
--- /dev/null
+++ b/std/machines/memory16.asm
@@ -0,0 +1,116 @@
+use std::array;
+use std::field::modulus;
+use std::check::assert;
+use std::machines::range::Bit12;
+use std::machines::range::Byte2;
+
+// A read/write memory, similar to that of Polygon:
+// https://github.com/0xPolygonHermez/zkevm-proverjs/blob/main/pil/mem.pil
+machine Memory16(bit12: Bit12, byte2: Byte2) with
+    latch: LATCH,
+    operation_id: m_is_write,
+    call_selectors: selectors,
+{
+    // We compute m_diff (28-Bit) + m_step (28-Bit) + 1, which fits into 29 Bits.
+    assert(modulus() > 2**29, || "Memory16 requires a field that fits any 29-Bit value.");
+
+    operation mload<0> m_addr_high, m_addr_low, m_step -> m_value1, m_value2;
+    operation mstore<1> m_addr_high, m_addr_low, m_step, m_value1, m_value2 ->;
+
+    let LATCH = 1;
+
+    // =============== read-write memory =======================
+    // Read-write memory. Columns are sorted by addr and
+    // then by step. change is 1 if and only if addr changes
+    // in the next row.
+    // Note that these column names are used by witgen to detect
+    // this machine...
+    col witness m_addr_high, m_addr_low;
+    col witness m_step_high, m_step_low;
+    col witness m_change;
+    col witness m_value1, m_value2;
+
+    link => bit12.check(m_step_high);
+    link => byte2.check(m_step_low);
+    let m_step = m_step_high * 2**16 + m_step_low;
+
+    link => byte2.check(m_value1);
+    link => byte2.check(m_value2);
+
+    // Memory operation flags
+    col witness m_is_write;
+    std::utils::force_bool(m_is_write);
+
+    // is_write can only be 1 if a selector is active
+    let is_mem_op = array::sum(selectors);
+    std::utils::force_bool(is_mem_op);
+    (1 - is_mem_op) * m_is_write = 0;
+
+    // If the next line is a not a write and we have an address change,
+    // then the value is zero.
+    (1 - m_is_write') * m_change * m_value1' = 0;
+    (1 - m_is_write') * m_change * m_value2' = 0;
+
+    // change has to be 1 in the last row, so that a first read on row zero is constrained to return 0
+    (1 - m_change) * LAST = 0;
+
+    // If the next line is a read and we stay at the same address, then the
+    // value cannot change.
+    (1 - m_is_write') * (1 - m_change) * (m_value1' - m_value1) = 0;
+    (1 - m_is_write') * (1 - m_change) * (m_value2' - m_value2) = 0;
+
+    col fixed FIRST = [1] + [0]*;
+    let LAST = FIRST';
+
+    std::utils::force_bool(m_change);
+
+    // if change is zero, addr has to stay the same.
+    (m_addr_low' - m_addr_low) * (1 - m_change) = 0;
+    (m_addr_high' - m_addr_high) * (1 - m_change) = 0;
+
+    // Except for the last row, if m_change is 1, then addr has to increase,
+    // if it is zero, step has to increase.
+    // The diff has to be equal to the difference **minus one**.
+
+    // These two helper columns have different semantics, depending on
+    // whether we're comparing addresses or time steps.
+    // In both cases, m_tmp2 needs to be of 16 Bits.
+    col witness m_tmp1, m_tmp2;
+    link => byte2.check(m_diff);
+
+    // When comparing time steps, a 28-Bit diff is sufficient assuming a maximum step
+    // of 2**28.
+    // The difference is computed on the field, which is larger than 2**28.
+    // We prove that m_step' - m_step > 0 by letting the prover provide a 28-Bit value
+    // such that claimed_diff + 1 == m_step' - m_step.
+    // Because all values are constrained to be 28-Bit, no overflow can occur.
+    let m_diff_upper = m_tmp1;
+    let m_diff_lower = m_tmp2;
+    link if (1 - m_change) => bit12.check(m_diff_upper);
+    let claimed_time_step_diff = m_diff_upper * 2**16 + m_diff_lower;
+    let actual_time_step_diff = m_step' - m_step;
+    (1 - m_change) * (claimed_time_step_diff + 1 - actual_time_step_diff) = 0;
+
+    // When comparing addresses, we let the prover indicate whether the upper or lower
+    // limb needs to be compared and then assert that the diff is positive.
+    let address_high_unequal = m_tmp1;
+    let m_diff = m_tmp2;
+
+    // address_high_unequal is binary.
+    m_change * address_high_unequal * (address_high_unequal - 1) = 0;
+
+    // Whether to do any comparison.
+    // We want to compare whenever m_change == 1, but not in the last row.
+    // Because we constrained m_change to be 1 in the last row, this will just
+    // be equal to m_change, except that the last entry is 0.
+    // (`m_change * (1 - LAST)` would be the same, but of higher degree.) 
+    let do_comparison = m_change - LAST;
+
+    // If address_high_unequal is 0, the higher limbs should be equal.
+    do_comparison * (1 - address_high_unequal) * (m_addr_high' - m_addr_high) = 0;
+
+    // Assert that m_diff stores the actual diff - 1.
+    let actual_addr_limb_diff = address_high_unequal * (m_addr_high' - m_addr_high)
+                                + (1 - address_high_unequal) * (m_addr_low' - m_addr_low);
+    do_comparison * (m_diff + 1 - actual_addr_limb_diff) = 0;
+}
diff --git a/std/machines/mod.asm b/std/machines/mod.asm
index 5aaf3389cb..a0ad45a530 100644
--- a/std/machines/mod.asm
+++ b/std/machines/mod.asm
@@ -6,6 +6,7 @@ mod binary_bb;
 mod range;
 mod hash;
 mod memory;
+mod memory16;
 mod memory_with_bootloader_write;
 mod shift;
 mod shift16;
diff --git a/std/machines/range.asm b/std/machines/range.asm
index 482c5437cf..baf154d700 100644
--- a/std/machines/range.asm
+++ b/std/machines/range.asm
@@ -57,3 +57,15 @@ machine Bit7 with
     col fixed latch = [1]*;
     col fixed operation_id = [0]*;
 }
+
+machine Bit12 with
+    latch: latch,
+    operation_id: operation_id,
+    degree: 4096
+{
+    operation check<0> BIT12 -> ;
+
+    let BIT12: col = |i| i % (2**12);
+    let latch = 1;
+    col fixed operation_id = [0]*;
+}
diff --git a/test_data/std/memory16_test.asm b/test_data/std/memory16_test.asm
new file mode 100644
index 0000000000..2af5b391ae
--- /dev/null
+++ b/test_data/std/memory16_test.asm
@@ -0,0 +1,73 @@
+use std::machines::range::Bit12;
+use std::machines::range::Byte2;
+use std::machines::memory16::Memory16;
+
+machine Main with degree: 65536 {
+    reg pc[@pc];
+    reg X1[<=];
+    reg X2[<=];
+    reg Y1[<=];
+    reg Y2[<=];
+    reg A1;
+    reg A2;
+
+    col fixed STEP(i) { i };
+    Bit12 bit12;
+    Byte2 byte2;
+    Memory16 memory(bit12, byte2);
+
+    instr mload X1, X2 -> Y1, Y2 link ~> (Y1, Y2) = memory.mload(X1, X2, STEP);
+    instr mstore X1, X2, Y1, Y2 -> link ~> memory.mstore(X1, X2, STEP, Y1, Y2);
+
+    instr assert_eq X1, Y1 {
+        X1 = Y1
+    }
+
+    function main {
+
+        // Store 4
+        mstore 0, 100, 0, 4;
+        
+        // Read uninitialized memory
+        A1, A2 <== mload(1, 100);
+        assert_eq A1, 0;
+        assert_eq A2, 0;
+
+        // Read previously stored value
+        A1, A2 <== mload(0, 100);
+        assert_eq A1, 0;
+        assert_eq A2, 4;
+
+        // Update previously stored value
+        mstore 0, 100, 0, 7;
+        mstore 0, 100, 0, 8;
+
+        // Read updated values (twice)
+        A1, A2 <== mload(0, 100);
+        assert_eq A1, 0;
+        assert_eq A2, 8;
+        A1, A2 <== mload(0, 100);
+        assert_eq A1, 0;
+        assert_eq A2, 8;
+
+        // Write to previously uninitialized memory cell
+        mstore 1, 100, 0, 1234;
+        A1, A2 <== mload(1, 100);
+        assert_eq A1, 0;
+        assert_eq A2, 1234;
+
+        // Write max value
+        mstore 0, 200, 0xffff, 0xffff;
+        A1, A2 <== mload(0, 200);
+        assert_eq A1, 0xffff;
+        assert_eq A2, 0xffff;
+
+        // Store at maximal address
+        mstore 0xffff, 0xfffc, 1, 0;
+        A1, A2 <== mload(0xffff, 0xfffc);
+        assert_eq A1, 1;
+        assert_eq A2, 0;
+
+        return;
+    }
+}