Add Phase 4: length reasoning and canonize fix

- Length axiom generation: concat decomposition, unit=1, empty=0, string=k - Bidirectional len(x)=0 <-> x="" axioms - check_zero_length: propagate zero-length variables to empty - propagate_length_eqs: equate lengths of equation sides - Fixed canonize: decompose structurally first, then follow e-graph roots Prevents both equation sides resolving to same root expression - Fixed branch_var_prefix: skip cyclic candidates (containing variable itself) and ground candidates with wrong length - check_length_conflict prevents e-graph corruption from incompatible merges - Fresh value pinning for model generation (m_fresh_values) - Regression: 36/62 correct, 0 wrong, 26 unknown on string-concat.smt2 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-15 05:15:41 +00:00 · 2026-02-27 22:48:58 -08:00 · 2026-02-27 22:48:58 -08:00 · 4b58d57b61
commit 4b58d57b61
parent f48040d809
5 changed files with 259 additions and 25 deletions
--- a/spec/log.txt
+++ b/spec/log.txt
--- a/src/smt/nseq_state.cpp
+++ b/src/smt/nseq_state.cpp
@ -28,7 +28,8 @@ namespace smt {
          m_eq_id(0),
          m_axioms(m),
          m_axioms_head(0),
-          m_length_apps(m) {
+          m_length_apps(m),
+          m_length_exprs(m) {
    }

    void nseq_state::push_scope() {
@ -56,6 +57,7 @@ namespace smt {
        m_axioms_head = 0;
        m_has_length.reset();
        m_length_apps.reset();
+        m_length_exprs.reset();
        m_eq_id = 0;
    }

@ -99,8 +101,10 @@ namespace smt {
        if (m_has_length.contains(e))
            return;
        m_length_apps.push_back(len_app);
+        m_length_exprs.push_back(e);
        m_has_length.insert(e);
        ts.push(push_back_vector<expr_ref_vector>(m_length_apps));
+        ts.push(push_back_vector<expr_ref_vector>(m_length_exprs));
        ts.push(insert_obj_trail<expr>(m_has_length, e));
    }

--- a/src/smt/nseq_state.h
+++ b/src/smt/nseq_state.h
@ -71,6 +71,7 @@ namespace smt {
        // Length tracking
        obj_hashtable<expr>      m_has_length;
        expr_ref_vector          m_length_apps;
+        expr_ref_vector          m_length_exprs;  // corresponding string exprs

        // Trail for undo
        trail_stack              m_trail;
@ -107,6 +108,27 @@ namespace smt {
        // Length tracking
        bool has_length(expr* e) const { return m_has_length.contains(e); }
        void add_length(expr* len_app, expr* e, trail_stack& ts);
+        unsigned length_count() const { return m_length_apps.size(); }
+
+        struct length_pair { expr* len_app; expr* str_expr; };
+        class length_pair_iter {
+            expr_ref_vector const& m_apps;
+            expr_ref_vector const& m_exprs;
+        public:
+            length_pair_iter(expr_ref_vector const& a, expr_ref_vector const& e) : m_apps(a), m_exprs(e) {}
+            struct iterator {
+                expr_ref_vector const& m_apps;
+                expr_ref_vector const& m_exprs;
+                unsigned m_idx;
+                iterator(expr_ref_vector const& a, expr_ref_vector const& e, unsigned i) : m_apps(a), m_exprs(e), m_idx(i) {}
+                length_pair operator*() const { return { m_apps[m_idx], m_exprs[m_idx] }; }
+                iterator& operator++() { ++m_idx; return *this; }
+                bool operator!=(iterator const& o) const { return m_idx != o.m_idx; }
+            };
+            iterator begin() const { return iterator(m_apps, m_exprs, 0); }
+            iterator end() const { return iterator(m_apps, m_exprs, m_apps.size()); }
+        };
+        length_pair_iter length_pairs() const { return length_pair_iter(m_length_apps, m_length_exprs); }

        // Accessors
        scoped_vector<nseq_eq> const& eqs() const { return m_eqs; }
--- a/src/smt/theory_nseq.cpp
+++ b/src/smt/theory_nseq.cpp
@ -36,7 +36,8 @@ namespace smt {
          m_nielsen(ctx.get_manager(), m_seq_rewrite),
          m_find(*this),
          m_has_seq(false),
-          m_new_propagation(false) {
+          m_new_propagation(false),
+          m_fresh_values(ctx.get_manager()) {
    }

    theory_nseq::~theory_nseq() {
@ -71,8 +72,15 @@ namespace smt {
        if (solve_eqs())
            return FC_CONTINUE;

+        // Propagate length equalities from equations
+        if (propagate_length_eqs())
+            return FC_CONTINUE;
+
+        // Check zero-length variables
+        if (check_zero_length())
+            return FC_CONTINUE;
+
        // TODO: implement regex membership checking
-        // TODO: implement length/Parikh reasoning

        if (all_eqs_solved() && m_state.mems().empty())
            return FC_DONE;
@ -89,6 +97,7 @@ namespace smt {
    }

    bool theory_nseq::internalize_term(app* term) {
+        TRACE(seq, tout << "internalize_term: " << mk_bounded_pp(term, m, 3) << "\n";);
        if (!m_has_seq) {
            ctx.push_trail(value_trail<bool>(m_has_seq));
            m_has_seq = true;
@ -152,6 +161,7 @@ namespace smt {
    }

    void theory_nseq::apply_sort_cnstr(enode* n, sort* s) {
+        TRACE(seq, tout << "apply_sort_cnstr: " << mk_bounded_pp(n->get_expr(), m, 3) << "\n";);
        mk_var(n);
    }

@ -255,8 +265,10 @@ namespace smt {
    }

    void theory_nseq::relevant_eh(app* n) {
-        if (m_util.str.is_length(n))
+        if (m_util.str.is_length(n)) {
            add_length(n);
+            enque_axiom(n);
+        }

        // Enqueue axioms for operations that need reduction
        if (m_util.str.is_index(n)   ||
@ -280,8 +292,9 @@ namespace smt {
    }

    void theory_nseq::deque_axiom(expr* e) {
-        // TODO: generate axioms for string operations
        TRACE(seq, tout << "deque axiom: " << mk_bounded_pp(e, m) << "\n";);
+        if (m_util.str.is_length(e))
+            add_length_axiom(e);
    }

    void theory_nseq::add_axiom(literal l1, literal l2, literal l3, literal l4, literal l5) {
@ -374,30 +387,70 @@ namespace smt {
        return expr_ref(m_util.str.mk_length(s), m);
    }

+    // Ensure that a string expression has its length axiom.
+    // Called when equation solving creates or uses new string terms.
+    void theory_nseq::ensure_length_axiom(expr* e) {
+        if (!m_util.is_seq(e))
+            return;
+        expr_ref len(m_util.str.mk_length(e), m);
+        ensure_enode(len);
+        if (!m_state.has_length(e)) {
+            m_state.add_length(len, e, m_state.trail());
+            enque_axiom(len);
+        }
+    }
+
    expr_ref theory_nseq::mk_concat(expr_ref_vector const& es, sort* s) {
        SASSERT(!es.empty());
        return expr_ref(m_util.str.mk_concat(es.size(), es.data(), s), m);
    }

-    // Canonize an equation side using current e-graph equivalences.
+    // Check if a propagation x = t would create a length inconsistency.
+    // If x has a known fixed length that differs from t's minimum length, return true (conflict).
+    bool theory_nseq::check_length_conflict(expr* x, expr_ref_vector const& es, nseq_dependency* dep) {
+        rational x_lo, x_hi;
+        expr_ref x_len = mk_len(x);
+        if (!ctx.e_internalized(x_len))
+            return false;
+        if (!lower_bound(x_len, x_lo) || !upper_bound(x_len, x_hi))
+            return false;
+        // Compute minimum length of the other side
+        rational min_len(0);
+        for (expr* e : es) {
+            zstring s;
+            if (m_util.str.is_string(e, s))
+                min_len += rational(s.length());
+            else if (m_util.str.is_unit(e))
+                min_len += rational(1);
+            // Variables contribute ≥ 0, so min remains unchanged
+        }
+        if (x_hi < min_len) {
+            // x's max length is less than the minimum length of the term
+            set_conflict(dep);
+            return true;
+        }
+        return false;
+    }
    // Replaces each element with its canonical representative, decomposing
    // concatenations and string constants as needed.
    bool theory_nseq::canonize(expr_ref_vector const& src, expr_ref_vector& dst,
                                nseq_dependency*& dep) {
        dst.reset();
-        for (expr* e : src) {
-            if (!ctx.e_internalized(e)) {
-                dst.push_back(e);
+        // Step 1: structurally decompose each src expression
+        expr_ref_vector units(m);
+        for (expr* e : src)
+            m_util.str.get_concat_units(e, units);
+        // Step 2: follow e-graph roots for each atomic component
+        for (expr* u : units) {
+            if (!ctx.e_internalized(u)) {
+                dst.push_back(u);
                continue;
            }
-            enode* n = ctx.get_enode(e);
+            enode* n = ctx.get_enode(u);
            enode* r = n->get_root();
            expr* re = r->get_expr();
-            if (re != e) {
-                // Track the dependency for the equality
+            if (re != u)
                dep = m_state.mk_join(dep, m_state.mk_dep(n, r));
-            }
-            // Decompose the canonical representative into concat components
            m_util.str.get_concat_units(re, dst);
        }
        return true;
@ -408,14 +461,11 @@ namespace smt {
        for (auto const& eq : m_state.eqs()) {
            expr_ref_vector lhs(m), rhs(m);
            nseq_dependency* dep = eq.dep();
-            if (!canonize(eq.lhs(), lhs, dep) || !canonize(eq.rhs(), rhs, dep))
+            if (!canonize(eq.lhs(), lhs, dep))
+                return false;
+            if (!canonize(eq.rhs(), rhs, dep))
                return false;
            seq::nielsen_result result = m_nielsen.simplify(lhs, rhs);
-            TRACE(seq, tout << "all_eqs_solved [" << eq.id() << "]: ";
-                  for (auto* e : lhs) tout << mk_bounded_pp(e, m, 2) << " ";
-                  tout << "= ";
-                  for (auto* e : rhs) tout << mk_bounded_pp(e, m, 2) << " ";
-                  tout << " -> " << (int)result << "\n";);
            if (result != seq::nielsen_result::solved)
                return false;
        }
@ -441,6 +491,7 @@ namespace smt {
        if (!canonize(eq.lhs(), lhs, dep) || !canonize(eq.rhs(), rhs, dep))
            return false;

+
        TRACE(seq, tout << "solve_eq [" << eq.id() << "]: ";
              for (auto* e : lhs) tout << mk_bounded_pp(e, m, 2) << " ";
              tout << "= ";
@ -455,13 +506,19 @@ namespace smt {
            // Propagate solved form: either both empty, var = empty, or var = ground term
            bool changed = false;
            if (lhs.size() == 1 && m_nielsen.is_var(lhs.get(0)) && !rhs.empty()) {
+                if (check_length_conflict(lhs.get(0), rhs, dep))
+                    return true;
                sort* s = lhs.get(0)->get_sort();
                expr_ref rhs_concat = mk_concat(rhs, s);
+                ensure_length_axiom(rhs_concat);
                changed = propagate_eq(dep, lhs.get(0), rhs_concat);
            }
            else if (rhs.size() == 1 && m_nielsen.is_var(rhs.get(0)) && !lhs.empty()) {
+                if (check_length_conflict(rhs.get(0), lhs, dep))
+                    return true;
                sort* s = rhs.get(0)->get_sort();
                expr_ref lhs_concat = mk_concat(lhs, s);
+                ensure_length_axiom(lhs_concat);
                changed = propagate_eq(dep, rhs.get(0), lhs_concat);
            }
            else {
@ -490,6 +547,8 @@ namespace smt {
                sort* s = lhs[0]->get_sort();
                expr_ref l = mk_concat(lhs, s);
                expr_ref r = mk_concat(rhs, s);
+                ensure_length_axiom(l);
+                ensure_length_axiom(r);
                changed = propagate_eq(dep, l, r);
            }
            else {
@ -574,8 +633,37 @@ namespace smt {
        // For x starting one side, try x = prefix of other side
        // x = "" was already tried (assigned false)
        // Now enumerate: x = other[0], x = other[0]·other[1], ...
+        rational x_len;
+        bool has_x_len = get_length(x, x_len);
+
+        // Find the range of valid prefix lengths
+        // Stop before we include x itself (cyclic candidate)
+        unsigned max_prefix = other.size();
+        enode* x_root = ctx.get_enode(x)->get_root();
+        for (unsigned i = 0; i < other.size(); ++i) {
+            if (ctx.e_internalized(other[i]) &&
+                ctx.get_enode(other[i])->get_root() == x_root) {
+                max_prefix = i;
+                break;
+            }
+        }
+        if (max_prefix == 0)
+            return false;
+
        expr_ref candidate(m);
-        for (unsigned len = 1; len <= other.size(); ++len) {
+        unsigned min_len = 0;
+        bool has_var = false;
+        for (unsigned len = 1; len <= max_prefix; ++len) {
+            min_len += m_util.str.min_length(other[len - 1]);
+            has_var = has_var || m_nielsen.is_var(other[len - 1]);
+            // Skip if candidate length is incompatible with variable length
+            if (has_x_len) {
+                if (rational(min_len) > x_len)
+                    continue;
+                // For ground candidates (no variables), exact length must match
+                if (!has_var && rational(min_len) != x_len)
+                    continue;
+            }
            if (len == 1)
                candidate = other[0];
            else
@ -584,12 +672,14 @@ namespace smt {
            switch (ctx.get_assignment(eq)) {
            case l_undef:
                TRACE(seq, tout << "branch " << mk_bounded_pp(x, m) << " = " << mk_bounded_pp(candidate, m) << "\n";);
+                ensure_length_axiom(candidate);
                ctx.force_phase(eq);
                ctx.mark_as_relevant(eq);
                m_new_propagation = true;
                m_state.stats().m_num_splits++;
                return true;
            case l_true:
+                ensure_length_axiom(candidate);
                propagate_eq(dep, x, candidate);
                return true;
            case l_false:
@ -599,6 +689,109 @@ namespace smt {
        return false;
    }

+    // -------------------------------------------------------
+    // Length reasoning
+    // -------------------------------------------------------
+
+    void theory_nseq::add_length_axiom(expr* n) {
+        expr* x = nullptr;
+        VERIFY(m_util.str.is_length(n, x));
+        if (m_util.str.is_concat(x) && to_app(x)->get_num_args() != 0) {
+            // len(x1 ++ x2 ++ ...) = len(x1) + len(x2) + ...
+            ptr_vector<expr> args;
+            for (auto arg : *to_app(x))
+                args.push_back(m_util.str.mk_length(arg));
+            expr_ref len_sum(m_autil.mk_add(args.size(), args.data()), m);
+            add_axiom(mk_eq(len_sum, n, false));
+        }
+        else if (m_util.str.is_unit(x)) {
+            // len(unit(c)) = 1
+            add_axiom(mk_eq(n, m_autil.mk_int(1), false));
+        }
+        else if (m_util.str.is_empty(x)) {
+            // len("") = 0
+            add_axiom(mk_eq(n, m_autil.mk_int(0), false));
+        }
+        else {
+            zstring s;
+            if (m_util.str.is_string(x, s)) {
+                // len("abc") = 3
+                add_axiom(mk_eq(n, m_autil.mk_int(s.length()), false));
+            }
+            else {
+                // len(x) >= 0
+                add_axiom(mk_literal(m_autil.mk_ge(n, m_autil.mk_int(0))));
+                // len(x) = 0 ↔ x = ""
+                expr_ref emp(m_util.str.mk_empty(x->get_sort()), m);
+                literal len_zero = mk_eq(n, m_autil.mk_int(0), false);
+                literal x_empty = mk_eq(x, emp, false);
+                add_axiom(~len_zero, x_empty);   // len(x) = 0 → x = ""
+                add_axiom(len_zero, ~x_empty);    // x = "" → len(x) = 0
+            }
+        }
+    }
+
+    bool theory_nseq::get_length(expr* e, rational& val) {
+        expr_ref len = mk_len(e);
+        return m_arith_value.get_value(len, val);
+    }
+
+    bool theory_nseq::lower_bound(expr* e, rational& lo) {
+        bool strict;
+        return m_arith_value.get_lo(e, lo, strict) && !strict;
+    }
+
+    bool theory_nseq::upper_bound(expr* e, rational& hi) {
+        bool strict;
+        return m_arith_value.get_up(e, hi, strict) && !strict;
+    }
+
+    bool theory_nseq::check_zero_length() {
+        bool progress = false;
+        for (auto const& [len_app, str_expr] : m_state.length_pairs()) {
+            rational lo, hi;
+            if (lower_bound(len_app, lo) && upper_bound(len_app, hi) &&
+                lo.is_zero() && hi.is_zero()) {
+                // len(e) = 0 → e = ""
+                expr* e = str_expr;
+                expr_ref emp(m_util.str.mk_empty(e->get_sort()), m);
+                enode* n = ensure_enode(e);
+                enode* n_emp = ensure_enode(emp);
+                if (n->get_root() != n_emp->get_root()) {
+                    literal len_zero = mk_eq(len_app, m_autil.mk_int(0), false);
+                    nseq_dependency* dep = m_state.mk_dep(len_zero);
+                    propagate_eq(dep, e, emp);
+                    progress = true;
+                }
+            }
+        }
+        return progress;
+    }
+
+    bool theory_nseq::propagate_length_eqs() {
+        bool progress = false;
+        for (auto const& eq : m_state.eqs()) {
+            expr_ref_vector const& lhs = eq.lhs();
+            expr_ref_vector const& rhs = eq.rhs();
+            if (lhs.empty() || rhs.empty())
+                continue;
+            sort* s = lhs[0]->get_sort();
+            expr_ref l = mk_concat(lhs, s);
+            expr_ref r = mk_concat(rhs, s);
+            expr_ref len_l = mk_len(l);
+            expr_ref len_r = mk_len(r);
+            if (ctx.e_internalized(len_l) && ctx.e_internalized(len_r)) {
+                enode* nl = ctx.get_enode(len_l);
+                enode* nr = ctx.get_enode(len_r);
+                if (nl->get_root() != nr->get_root()) {
+                    propagate_eq(eq.dep(), len_l, len_r, false);
+                    progress = true;
+                }
+            }
+        }
+        return progress;
+    }
+
    // -------------------------------------------------------
    // Display and statistics
    // -------------------------------------------------------
@ -626,7 +819,6 @@ namespace smt {

    model_value_proc* theory_nseq::mk_value(enode* n, model_generator& mg) {
        app* e = n->get_expr();
-        TRACE(seq, tout << "mk_value: " << mk_bounded_pp(e, m) << "\n";);

        if (m_util.is_seq(e)) {
            // Walk the equivalence class looking for a concrete string value
@ -635,18 +827,22 @@ namespace smt {
            do {
                expr* ce = curr->get_expr();
                zstring s;
-                if (m_util.str.is_string(ce, s))
+                if (m_util.str.is_string(ce, s)) {
                    return alloc(expr_wrapper_proc, to_app(ce));
-                if (m_util.str.is_empty(ce))
+                }
+                if (m_util.str.is_empty(ce)) {
                    return alloc(expr_wrapper_proc, to_app(ce));
+                }
                curr = curr->get_next();
            } while (curr != root);

            // No concrete value found: use seq_factory to get a fresh value
            expr_ref val(m);
            val = mg.get_model().get_fresh_value(e->get_sort());
-            if (val)
+            if (val) {
+                m_fresh_values.push_back(val);
                return alloc(expr_wrapper_proc, to_app(val));
+            }
            // Fallback: empty string
            return alloc(expr_wrapper_proc, to_app(m_util.str.mk_empty(e->get_sort())));
        }
@ -662,6 +858,7 @@ namespace smt {
    }

    void theory_nseq::finalize_model(model_generator& mg) {
+        m_fresh_values.reset();
    }
 }

--- a/src/smt/theory_nseq.h
+++ b/src/smt/theory_nseq.h
@ -44,6 +44,7 @@ namespace smt {
        nseq_union_find  m_find;
        bool             m_has_seq;
        bool             m_new_propagation;
+        expr_ref_vector  m_fresh_values;  // keep model fresh values alive

        // Theory interface
        final_check_status final_check_eh(unsigned) override;
@ -83,6 +84,7 @@ namespace smt {

        // Helpers
        void add_length(expr* l);
+        void ensure_length_axiom(expr* e);
        literal mk_literal(expr* e);
        literal mk_eq_empty(expr* e, bool phase = true);
        expr_ref mk_len(expr* s);
@ -96,6 +98,15 @@ namespace smt {
        bool branch_var_prefix(expr* x, expr_ref_vector const& other, nseq_dependency* dep);
        bool canonize(expr_ref_vector const& src, expr_ref_vector& dst, nseq_dependency*& dep);
        bool all_eqs_solved();
+        bool check_length_conflict(expr* x, expr_ref_vector const& es, nseq_dependency* dep);
+
+        // Length reasoning
+        void add_length_axiom(expr* n);
+        bool check_zero_length();
+        bool propagate_length_eqs();
+        bool get_length(expr* e, rational& val);
+        bool lower_bound(expr* e, rational& lo);
+        bool upper_bound(expr* e, rational& hi);

    public:
        theory_nseq(context& ctx);