From 352b14fe2b26efab7dd78ff51563cdb8ffed0d23 Mon Sep 17 00:00:00 2001
From: Nikolaj Bjorner <nbjorner@microsoft.com>
Date: Tue, 21 Apr 2026 09:16:00 +0200
Subject: [PATCH] fix and optimize not-contains and regex equalities

Signed-off-by: Nikolaj Bjorner <nbjorner@microsoft.com>
---
 src/ast/rewriter/seq_axioms.cpp | 12 ++++++++
 src/smt/seq/seq_nielsen.cpp     | 44 +++++++++------------------
 src/smt/seq/seq_nielsen.h       | 18 ++++++++---
 src/smt/theory_nseq.cpp         | 54 ++++++++++++++++++++++++++++-----
 4 files changed, 86 insertions(+), 42 deletions(-)

diff --git a/src/ast/rewriter/seq_axioms.cpp b/src/ast/rewriter/seq_axioms.cpp
index ce2070cfd..664d5d352 100644
--- a/src/ast/rewriter/seq_axioms.cpp
+++ b/src/ast/rewriter/seq_axioms.cpp
@@ -1353,11 +1353,23 @@ namespace seq {
         Create the recursive function not_contains that implements this recursive definition.
         Add the axiom
             contains(a, b) or not_contains(a, b)
+
+      if b is a string (or concatentation of characters/unit values) use regex shortcut:
+
+      ~contain(a, b) => a not in .*b.*
     */
 
     void axioms::not_contains_axiom(expr *e) {
         expr* _a = nullptr, *_b = nullptr;
         VERIFY(seq.str.is_contains(e, _a, _b));
+        if (seq.str.is_string(_b)) {
+            auto star = seq.re.mk_full_seq(seq.re.mk_re(_b->get_sort()));
+            auto b_re = seq.re.mk_to_re(_b);
+            auto r = seq.re.mk_concat(star, seq.re.mk_concat(b_re, star));
+            auto mem = seq.re.mk_in_re(_a, r);
+            add_clause(expr_ref(e, m), ~expr_ref(mem, m));
+            return;
+        }
         auto ca = purify(_a);
         auto cb = purify(_b);
         sort* srt = ca->get_sort();
diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp
index ee8eda179..b6be9b93c 100644
--- a/src/smt/seq/seq_nielsen.cpp
+++ b/src/smt/seq/seq_nielsen.cpp
@@ -866,8 +866,7 @@ namespace seq {
 
             // pass 2: detect symbol clashes, empty-propagation, and prefix cancellation
             for (str_eq& eq : m_str_eq) {
-                if (!eq.m_lhs || !eq.m_rhs)
-                    continue;
+                SASSERT(eq.well_formed());
 
                 // one side empty, the other not empty => conflict check
                 // (the actual substitution is done in apply_det_modifier)
@@ -1915,8 +1914,7 @@ namespace seq {
         for (str_eq const& eq : node->str_eqs()) {
             if (eq.is_trivial())
                 continue;
-            if (!eq.m_lhs || !eq.m_rhs)
-                continue;
+            SASSERT(eq.well_formed());
             for (unsigned od = 0; od < 2; ++od) {
                 bool fwd = (od == 0);
                 euf::snode* lhead = dir_token(eq.m_lhs, fwd);
@@ -1951,8 +1949,7 @@ namespace seq {
         for (str_eq const& eq : node->str_eqs()) {
             if (eq.is_trivial())
                 continue;
-            if (!eq.m_lhs || !eq.m_rhs)
-                continue;
+            SASSERT(eq.well_formed());
             for (unsigned od = 0; od < 2; ++od) {
                 bool fwd = od == 0;
                 euf::snode* lhead = dir_token(eq.m_lhs, fwd);
@@ -2189,8 +2186,7 @@ namespace seq {
             str_eq const& eq = node->str_eqs()[eq_idx];
             if (eq.is_trivial())
                 continue;
-            if (!eq.m_lhs || !eq.m_rhs)
-                continue;
+            SASSERT(eq.well_formed());
             // EqSplit only applies to regex-free equations.
             if (!eq.m_lhs->is_regex_free() || !eq.m_rhs->is_regex_free())
                 continue;
@@ -2368,8 +2364,7 @@ namespace seq {
         for (str_eq const& eq : node->str_eqs()) {
             if (eq.is_trivial())
                 continue;
-            if (!eq.m_lhs || !eq.m_rhs)
-                continue;
+            SASSERT(eq.well_formed());
             euf::snode_vector toks;
             eq.m_lhs->collect_tokens(toks);
             for (euf::snode* t : toks) {
@@ -2399,8 +2394,7 @@ namespace seq {
         for (str_eq const& eq : node->str_eqs()) {
             if (eq.is_trivial())
                 continue;
-            if (!eq.m_lhs || !eq.m_rhs)
-                continue;
+            SASSERT(eq.well_formed());
             for (unsigned od = 0; od < 2; ++od) {
                 bool local_fwd = (od == 0);
                 euf::snode* lhead = dir_token(eq.m_lhs, local_fwd);
@@ -2439,7 +2433,7 @@ namespace seq {
                                           str_eq const*& eq_out,
                                           bool& fwd) const {
         for (str_eq const& eq : node->str_eqs()) {
-            SASSERT(eq.m_lhs && eq.m_rhs && !eq.is_trivial());
+            SASSERT(eq.well_formed() && !eq.is_trivial());
 
             for (unsigned od = 0; od < 2; ++od) {
                 bool local_fwd = (od == 0);
@@ -2506,8 +2500,7 @@ namespace seq {
         for (str_eq const& eq : node->str_eqs()) {
             if (eq.is_trivial())
                 continue;
-            if (!eq.m_lhs || !eq.m_rhs)
-                continue;
+            SASSERT(eq.well_formed());
             if (eq.m_lhs->is_empty() && eq.m_rhs->first() && eq.m_rhs->first()->is_power()) {
                 power = eq.m_rhs->first();
                 dep = eq.m_dep;
@@ -2560,10 +2553,9 @@ namespace seq {
 
         // Look for two directional endpoint power tokens with the same base.
         for (str_eq const& eq : node->str_eqs()) {
+            SASSERT(eq.well_formed());
             if (eq.is_trivial())
                 continue;
-            if (!eq.m_lhs || !eq.m_rhs)
-                continue;
             for (unsigned od = 0; od < 2; ++od) {
                 bool fwd = (od == 0);
                 euf::snode* lhead = dir_token(eq.m_lhs, fwd);
@@ -2619,12 +2611,9 @@ namespace seq {
     bool nielsen_graph::apply_split_power_elim(nielsen_node* node) {
 
         for (str_eq const& eq : node->str_eqs()) {
+            SASSERT(eq.well_formed());
             if (eq.is_trivial())
                 continue;
-            // NB: Shuvendu - this test is always false based on how str_eqs are constructed
-            // it can be an assertion to force the invariant that str_eqs always have both sides non-null.          
-            if (!eq.m_lhs || !eq.m_rhs)
-                continue;
 
             for (int side = 0; side < 2; ++side) {
                 euf::snode* pow_side   = (side == 0) ? eq.m_lhs : eq.m_rhs;
@@ -2760,10 +2749,9 @@ namespace seq {
 
     bool nielsen_graph::apply_gpower_intr(nielsen_node* node) {
         for (str_eq const& eq : node->str_eqs()) {
+            SASSERT(eq.well_formed());
             if (eq.is_trivial())
                 continue;
-            if (!eq.m_lhs || !eq.m_rhs)
-                continue;
 
             // Try both directions (ZIPT's ExtendDir(fwd=true/false)).
             for (unsigned od = 0; od < 2; ++od) {
@@ -3896,7 +3884,7 @@ namespace seq {
         // Step 1: Compute LHS (|x|) for each non-eliminating substitution
         // using current m_mod_cnt (before bumping).
         // Also assert |x|_k >= 0 (mirrors ZIPT's NielsenNode constructor line 172).
-        svector<std::pair<unsigned, expr*>> lhs_exprs;
+        vector<std::pair<unsigned, expr_ref>> lhs_exprs;
         for (unsigned i = 0; i < substs.size(); ++i) {
             auto const& s = substs[i];
             if (!s.m_var->is_var())
@@ -3904,7 +3892,7 @@ namespace seq {
             if (!m_seq.is_seq(s.m_var->get_expr()))
                 continue;
             expr_ref lhs = compute_length_expr(s.m_var);
-            lhs_exprs.push_back({i, lhs.get()});
+            lhs_exprs.push_back({i, lhs});
             if (s.is_eliminating())
                 continue;
             has_non_elim = true;
@@ -3918,12 +3906,10 @@ namespace seq {
 
 
         // Step 3: Compute RHS (|u|) with bumped mod counts and add |x| = |u|.
-        for (auto const& p : lhs_exprs) {
-            unsigned idx = p.first;
-            expr* lhs_expr = p.second;
+        for (auto const &[idx, lhs] : lhs_exprs) {
             auto const& s = substs[idx];
             expr_ref rhs = compute_length_expr(s.m_replacement);
-            e->add_side_constraint(mk_constraint(m.mk_eq(lhs_expr, rhs), s.m_dep));
+            e->add_side_constraint(mk_constraint(m.mk_eq(lhs, rhs), s.m_dep));
         }
 
         // Step 4: Restore mod counts (temporary bump for computing RHS only).
diff --git a/src/smt/seq/seq_nielsen.h b/src/smt/seq/seq_nielsen.h
index 3212449af..0f58c7d45 100644
--- a/src/smt/seq/seq_nielsen.h
+++ b/src/smt/seq/seq_nielsen.h
@@ -371,10 +371,10 @@ namespace seq {
         euf::snode* m_rhs;
         dep_tracker m_dep;
 
-
-        str_eq() = default;
-        str_eq(euf::snode* lhs, euf::snode* rhs, dep_tracker const& dep):
-            m_lhs(lhs), m_rhs(rhs), m_dep(dep) {}
+        str_eq(euf::snode* lhs, euf::snode* rhs, dep_tracker const& dep): 
+            m_lhs(lhs), m_rhs(rhs), m_dep(dep) {
+            SASSERT(well_formed());
+        }
 
         bool operator==(str_eq const& other) const {
             return m_lhs == other.m_lhs && m_rhs == other.m_rhs;
@@ -388,6 +388,10 @@ namespace seq {
 
         // check if the constraint contains a given variable
         bool contains_var(euf::snode* var) const;
+
+        bool well_formed() const {
+            return !!m_lhs && !!m_rhs;
+        }
     };
 
     struct eq_pp {
@@ -407,7 +411,7 @@ namespace seq {
         euf::snode* m_regex;
         dep_tracker m_dep;
 
-        str_mem(): m_str(nullptr), m_regex(nullptr), m_dep(nullptr) {}
+        // str_mem(): m_str(nullptr), m_regex(nullptr), m_dep(nullptr) {}
         str_mem(euf::snode* str, euf::snode* regex, dep_tracker const& dep):
             m_str(str), m_regex(regex), m_dep(dep) {}
 
@@ -425,6 +429,10 @@ namespace seq {
 
         // check if the constraint contains a given variable
         bool contains_var(euf::snode* var) const;
+
+        bool well_formed() const {
+            return !!m_str && !!m_regex;
+        }
     };
 
     struct mem_pp {
diff --git a/src/smt/theory_nseq.cpp b/src/smt/theory_nseq.cpp
index 9ea9fe874..3fcb17adf 100644
--- a/src/smt/theory_nseq.cpp
+++ b/src/smt/theory_nseq.cpp
@@ -226,14 +226,33 @@ namespace smt {
 
     void theory_nseq::new_eq_eh(theory_var v1, theory_var v2) {
         try {
-            expr* e1 = get_enode(v1)->get_expr();
-            expr* e2 = get_enode(v2)->get_expr();
+            auto n1 = get_enode(v1);
+            auto n2 = get_enode(v2);
+            auto e1 = n1->get_expr();
+            auto e2 = n2->get_expr();
             TRACE(seq, tout << mk_pp(e1, m) << " == " << mk_pp(e2, m) << "\n");
             if (m_seq.is_re(e1)) {
+                seq_rewriter rw(m);
+                expr_ref s(m), r(m);
+                r = m_seq.re.mk_union(m_seq.re.mk_diff(e1, e2), m_seq.re.mk_diff(e2, e1));
+                switch (rw.some_seq_in_re(r, s)) { 
+                case l_false:
+                    // regexes are equivalent: nothing to do
+                    return;
+                case l_true: {
+                    // regexes are disjoint: conflict
+                    enode_pair_vector eqs;
+                    literal_vector lits;
+                    eqs.push_back({n1, n2});
+                    set_conflict(eqs, lits);
+                    return;
+                }
+                default: break;
+                }
                 push_unhandled_pred();
                 return;
             }
-            if (!m_seq.is_seq(e1) || !m_seq.is_seq(e2))
+            if (!m_seq.is_seq(e1))
                 return;
             euf::snode* s1 = get_snode(e1);
             euf::snode* s2 = get_snode(e2);
@@ -252,12 +271,31 @@ namespace smt {
     }
 
     void theory_nseq::new_diseq_eh(theory_var v1, theory_var v2) {
-        expr* e1 = get_enode(v1)->get_expr();
-        expr* e2 = get_enode(v2)->get_expr();
+        auto n1 = get_enode(v1);
+        auto n2 = get_enode(v2);
+        auto e1 = n1->get_expr();
+        auto e2 = n2->get_expr();
         TRACE(seq, tout << mk_pp(e1, m) << " != " << mk_pp(e2, m) << "\n");
-        if (m_seq.is_re(e1))
-            // regex disequality: nseq cannot verify language non-equivalence
-            push_unhandled_pred();
+        if (m_seq.is_re(e1)) {
+            seq_rewriter rw(m);
+            expr_ref s(m), r(m);
+            r = m_seq.re.mk_union(m_seq.re.mk_diff(e1, e2), m_seq.re.mk_diff(e2, e1));
+            switch (rw.some_seq_in_re(r, s)) {
+            case l_false: {
+                auto lit = mk_eq(e1, e2, false);
+                literal_vector lits;
+                lits.push_back(lit);
+                ctx.mk_th_axiom(get_id(), lits.size(), lits.data());
+                break;
+            }
+            case l_true: 
+                // the regexes are different
+                break;
+            case l_undef:                
+                push_unhandled_pred();
+                break;
+            }
+        }
         else if (m_seq.is_seq(e1) && !m_no_diseq_set.contains(e1) && !m_no_diseq_set.contains(e2))
             m_axioms.diseq_axiom(e1, e2);
         else