Merge pull request #1562 from mtrberzi/regex-develop

automata-based regex engine for Z3str3
2025-08-02 17:30:23 +00:00 · 2018-04-13 16:33:45 +08:00 · 2018-04-13 16:33:45 +08:00 · f1c51982f8
commit f1c51982f8
parent 28fbcd7687 3cfb32cd2d
7 changed files with 1919 additions and 13 deletions
--- a/src/ast/rewriter/seq_rewriter.cpp
+++ b/src/ast/rewriter/seq_rewriter.cpp
@ -200,6 +200,9 @@ void re2automaton::set_solver(expr_solver* solver) {
    m_sa = alloc(symbolic_automata_t, sm, *m_ba.get());
 }

+eautomaton* re2automaton::mk_product(eautomaton* a1, eautomaton* a2) { // forwards to mk_product on m_sa, the object allocated in set_solver
+    return m_sa->mk_product(*a1, *a2); // m_sa, a1 and a2 are dereferenced without checks; NOTE(review): ownership of the returned automaton is not visible here - confirm at call sites
+}

 eautomaton* re2automaton::operator()(expr* e) { 
    eautomaton* r = re2aut(e); 
--- a/src/ast/rewriter/seq_rewriter.h
+++ b/src/ast/rewriter/seq_rewriter.h
@ -53,7 +53,9 @@ public:
    bool is_range() const { return m_ty == t_range; }
    sort* get_sort() const { return m_sort; }
    expr* get_char() const { SASSERT(is_char()); return m_t; }
-
+    expr* get_pred() const { SASSERT(is_pred()); return m_t; } // returns m_t; asserts (via SASSERT) that this symbol is a predicate
+    expr* get_lo() const { SASSERT(is_range()); return m_t; } // returns m_t; asserts (via SASSERT) that this symbol is a range; presumably the range's low end - confirm
+    expr* get_hi() const { SASSERT(is_range()); return m_s; } // returns m_s; asserts (via SASSERT) that this symbol is a range; presumably the range's high end - confirm
 };

 class sym_expr_manager {
@ -87,6 +89,7 @@ public:
    ~re2automaton();
    eautomaton* operator()(expr* e);
    void set_solver(expr_solver* solver);
+    eautomaton* mk_product(eautomaton *a1, eautomaton *a2);
 };

 /**
--- a/src/smt/params/smt_params_helper.pyg
+++ b/src/smt/params/smt_params_helper.pyg
@ -79,6 +79,12 @@ def_module_params(module_name='smt',
                          ('theory_aware_branching', BOOL, False, 'Allow the context to use extra information from theory solvers regarding literal branching prioritization.'),
                          ('str.finite_overlap_models', BOOL, False, 'attempt a finite model search for overlapping variables instead of completely giving up on the arrangement'),
                          ('str.overlap_priority', DOUBLE, -0.1, 'theory-aware priority for overlapping variable cases; use smt.theory_aware_branching=true'),
+                          ('str.regex_automata', BOOL, True, 'use automata-based reasoning for regular expressions (Z3str3 only)'),
+                          ('str.regex_automata_difficulty_threshold', UINT, 1000, 'difficulty threshold for regex automata heuristics'),
+                          ('str.regex_automata_intersection_difficulty_threshold', UINT, 1000, 'difficulty threshold for regex intersection heuristics'),
+                          ('str.regex_automata_failed_automaton_threshold', UINT, 10, 'number of failed automaton construction attempts after which a full automaton is automatically built'),
+                          ('str.regex_automata_failed_intersection_threshold', UINT, 10, 'number of failed automaton intersection attempts after which intersection is always computed'),
+                          ('str.regex_automata_length_attempt_threshold', UINT, 10, 'number of length/path constraint attempts before checking unsatisfiability of regex terms'),
                          ('core.minimize', BOOL, False, 'minimize unsat core produced by SMT context'),
                          ('core.extend_patterns', BOOL, False, 'extend unsat core with literals that trigger (potential) quantifier instances'),
                          ('core.extend_patterns.max_distance', UINT, UINT_MAX, 'limits the distance of a pattern-extended unsat core'),
--- a/src/smt/params/theory_str_params.cpp
+++ b/src/smt/params/theory_str_params.cpp
@ -31,4 +31,10 @@ void theory_str_params::updt_params(params_ref const & _p) {
    m_UseBinarySearch = p.str_use_binary_search();
    m_BinarySearchInitialUpperBound = p.str_binary_search_start();
    m_OverlapTheoryAwarePriority = p.str_overlap_priority();
+    m_RegexAutomata = p.str_regex_automata();
+    m_RegexAutomata_DifficultyThreshold = p.str_regex_automata_difficulty_threshold();
+    m_RegexAutomata_IntersectionDifficultyThreshold = p.str_regex_automata_intersection_difficulty_threshold();
+    m_RegexAutomata_FailedAutomatonThreshold = p.str_regex_automata_failed_automaton_threshold();
+    m_RegexAutomata_FailedIntersectionThreshold = p.str_regex_automata_failed_intersection_threshold();
+    m_RegexAutomata_LengthAttemptThreshold = p.str_regex_automata_length_attempt_threshold();
 }
--- a/src/smt/params/theory_str_params.h
+++ b/src/smt/params/theory_str_params.h
@ -80,6 +80,43 @@ struct theory_str_params {

    double m_OverlapTheoryAwarePriority;

+    /*
+     * If RegexAutomata is set to true,
+     * Z3str3 will use automata-based methods to reason about
+     * regular expression constraints.
+     */
+    bool m_RegexAutomata;
+
+    /*
+     * RegexAutomata_DifficultyThreshold is the lowest difficulty above which Z3str3
+     * will not eagerly construct an automaton for a regular expression term.
+     */
+    unsigned m_RegexAutomata_DifficultyThreshold;
+
+    /*
+     * RegexAutomata_IntersectionDifficultyThreshold is the lowest difficulty above which Z3str3
+     * will not eagerly intersect automata to check unsatisfiability.
+     */
+    unsigned m_RegexAutomata_IntersectionDifficultyThreshold;
+
+    /*
+     * RegexAutomata_FailedAutomatonThreshold is the number of failed attempts to build an automaton
+     * after which a full automaton (i.e. with no length information) will be built regardless of difficulty.
+     */
+    unsigned m_RegexAutomata_FailedAutomatonThreshold;
+
+    /*
+     * RegexAutomata_FailedIntersectionThreshold is the number of failed attempts to perform automaton
+     * intersection after which intersection will always be performed regardless of difficulty.
+     */
+    unsigned m_RegexAutomata_FailedIntersectionThreshold;
+
+    /*
+     * RegexAutomata_LengthAttemptThreshold is the number of attempts to satisfy length/path constraints
+     * that are made before we begin checking unsatisfiability of a regex term.
+     */
+    unsigned m_RegexAutomata_LengthAttemptThreshold;
+
    theory_str_params(params_ref const & p = params_ref()):
        m_StrongArrangements(true),
        m_AggressiveLengthTesting(false),
@ -91,7 +128,13 @@ struct theory_str_params {
        m_FiniteOverlapModels(false),
        m_UseBinarySearch(false),
        m_BinarySearchInitialUpperBound(64),
-        m_OverlapTheoryAwarePriority(-0.1)
+        m_OverlapTheoryAwarePriority(-0.1),
+        m_RegexAutomata(true),
+        m_RegexAutomata_DifficultyThreshold(1000),
+        m_RegexAutomata_IntersectionDifficultyThreshold(1000),
+        m_RegexAutomata_FailedAutomatonThreshold(10),
+        m_RegexAutomata_FailedIntersectionThreshold(10),
+        m_RegexAutomata_LengthAttemptThreshold(10)
    {
        updt_params(p);
    }
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@ -20,9 +20,11 @@
 #include "util/trail.h"
 #include "util/union_find.h"
 #include "util/scoped_ptr_vector.h"
+#include "util/hashtable.h"
 #include "ast/ast_pp.h"
 #include "ast/arith_decl_plugin.h"
 #include "ast/rewriter/th_rewriter.h"
+#include "ast/rewriter/seq_rewriter.h"
 #include "ast/seq_decl_plugin.h"
 #include "smt/smt_theory.h"
 #include "smt/params/theory_str_params.h"
@ -36,6 +38,7 @@
 namespace smt {

 typedef hashtable<symbol, symbol_hash_proc, symbol_eq_proc> symbol_set;
+typedef int_hashtable<int_hash, default_eq<int> > integer_set;

 class str_value_factory : public value_factory {
    seq_util u;
@ -148,6 +151,70 @@ public:
    bool matches(zstring input);
 };

+class regex_automaton_under_assumptions { // bundles a regex term, an automaton, a polarity flag, and optional lower/upper bound assumptions
+protected:
+    expr * re_term; // regex term this record refers to; not freed by this class (see destructor)
+    eautomaton * aut; // associated automaton; not freed by this class (see destructor)
+    bool polarity; // NOTE(review): presumably distinguishes the regex from its complement - confirm in theory_str.cpp
+
+    bool assume_lower_bound; // true while lower_bound holds an active assumption
+    rational lower_bound; // only meaningful when assume_lower_bound is true
+
+    bool assume_upper_bound; // true while upper_bound holds an active assumption
+    rational upper_bound; // only meaningful when assume_upper_bound is true
+public:
+    regex_automaton_under_assumptions() : // default: null term and automaton, polarity false, no bound assumptions
+        re_term(NULL), aut(NULL), polarity(false),
+        assume_lower_bound(false), assume_upper_bound(false) {}
+
+    regex_automaton_under_assumptions(expr * re_term, eautomaton * aut, bool polarity) : // stores the given pointers and polarity; starts with no bound assumptions
+        re_term(re_term), aut(aut), polarity(polarity),
+        assume_lower_bound(false), assume_upper_bound(false) {}
+
+    void set_lower_bound(rational & lb) { // records lb and activates the lower-bound assumption; lb is only read here
+        lower_bound = lb;
+        assume_lower_bound = true;
+    }
+    void unset_lower_bound() { // deactivates the assumption; the stored lower_bound value is left as-is
+        assume_lower_bound = false;
+    }
+
+    void set_upper_bound(rational & ub) { // records ub and activates the upper-bound assumption; ub is only read here
+        upper_bound = ub;
+        assume_upper_bound = true;
+    }
+    void unset_upper_bound() { // deactivates the assumption; the stored upper_bound value is left as-is
+        assume_upper_bound = false;
+    }
+
+    bool get_lower_bound(rational & lb) const { // returns true and writes lb only if a lower bound is assumed; otherwise lb is untouched
+        if (assume_lower_bound) {
+            lb = lower_bound;
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    bool get_upper_bound(rational & ub) const { // returns true and writes ub only if an upper bound is assumed; otherwise ub is untouched
+        if (assume_upper_bound) {
+            ub = upper_bound;
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    eautomaton * get_automaton() const { return aut; } // stored automaton pointer (may be NULL for a default-constructed object)
+    expr * get_regex_term() const { return re_term; } // stored regex term pointer (may be NULL for a default-constructed object)
+    bool get_polarity() const { return polarity; } // stored polarity flag
+
+    virtual ~regex_automaton_under_assumptions() {
+        // don't free re_term or aut;
+        // they are managed separately
+    }
+};
+
 class theory_str : public theory {
    struct T_cut
    {
@ -250,6 +317,8 @@ protected:

    str_value_factory * m_factory;

+    re2automaton m_mk_aut;
+
    // Unique identifier appended to unused variables to ensure that model construction
    // does not introduce equalities when they weren't enforced.
    unsigned m_unused_id;
@ -267,6 +336,10 @@ protected:
    // enode lists for library-aware/high-level string terms (e.g. substr, contains)
    ptr_vector<enode> m_library_aware_axiom_todo;

+    // list of axioms that are re-asserted every time the scope is popped
+    expr_ref_vector m_persisted_axioms;
+    expr_ref_vector m_persisted_axiom_todo;
+
    // hashtable of all exprs for which we've already set up term-specific axioms --
    // this prevents infinite recursive descent with respect to axioms that
    // include an occurrence of the term for which axioms are being generated
@ -320,7 +393,31 @@ protected:
    // TBD: do a curried map for determinism.
    std::map<std::pair<expr*, zstring>, expr*> regex_in_bool_map;
    obj_map<expr, std::set<zstring> > regex_in_var_reg_str_map;
+
+    // regex automata
+    scoped_ptr_vector<eautomaton> m_automata;
+    ptr_vector<eautomaton> regex_automata;
+    obj_hashtable<expr> regex_terms;
+    obj_map<expr, ptr_vector<expr> > regex_terms_by_string; // S --> [ (str.in.re S *) ]
+    obj_map<expr, svector<regex_automaton_under_assumptions> > regex_automaton_assumptions; // RegEx --> [ aut+assumptions ]
    obj_map<expr, nfa> regex_nfa_cache; // Regex term --> NFA
+    obj_hashtable<expr> regex_terms_with_path_constraints; // set of string terms which have had path constraints asserted in the current scope
+    obj_hashtable<expr> regex_terms_with_length_constraints; // set of regex terms which have had length constraints asserted in the current scope
+    obj_map<expr, expr*> regex_term_to_length_constraint; // (str.in.re S R) -> (length constraint over S wrt. R)
+    obj_map<expr, ptr_vector<expr> > regex_term_to_extra_length_vars; // extra length vars used in regex_term_to_length_constraint entries
+
+    // keep track of the last lower/upper bound we saw for each string term
+    // so we don't perform duplicate work
+    obj_map<expr, rational> regex_last_lower_bound;
+    obj_map<expr, rational> regex_last_upper_bound;
+
+    // each counter maps a (str.in.re) expression to an integer.
+    // use helper functions regex_inc_counter() and regex_get_counter() to access
+    obj_map<expr, unsigned> regex_length_attempt_count;
+    obj_map<expr, unsigned> regex_fail_count;
+    obj_map<expr, unsigned> regex_intersection_fail_count;
+
+    obj_map<expr, ptr_vector<expr> > string_chars; // S --> [S_0, S_1, ...] for character terms S_i

    svector<char> char_set;
    std::map<char, int>  charSetLookupTable;
@ -439,14 +536,32 @@ protected:
    void instantiate_axiom_str_to_int(enode * e);
    void instantiate_axiom_int_to_str(enode * e);

+    void add_persisted_axiom(expr * a);
+
    expr * mk_RegexIn(expr * str, expr * regexp);
    void instantiate_axiom_RegexIn(enode * e);
    app * mk_unroll(expr * n, expr * bound);
-
    void process_unroll_eq_const_str(expr * unrollFunc, expr * constStr);
    void unroll_str2reg_constStr(expr * unrollFunc, expr * eqConstStr);
    void process_concat_eq_unroll(expr * concat, expr * unroll);

+    // regex automata and length-aware regex
+    unsigned estimate_regex_complexity(expr * re);
+    unsigned estimate_regex_complexity_under_complement(expr * re);
+    unsigned estimate_automata_intersection_difficulty(eautomaton * aut1, eautomaton * aut2);
+    bool check_regex_length_linearity(expr * re);
+    bool check_regex_length_linearity_helper(expr * re, bool already_star);
+    expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables);
+    void check_subterm_lengths(expr * re, integer_set & lens);
+    void find_automaton_initial_bounds(expr * str_in_re, eautomaton * aut);
+    bool refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound);
+    bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound);
+    expr_ref generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints);
+    void aut_path_add_next(u_map<expr*>& next, expr_ref_vector& trail, unsigned idx, expr* cond);
+    expr_ref aut_path_rewrite_constraint(expr * cond, expr * ch_var);
+    void regex_inc_counter(obj_map<expr, unsigned> & counter_map, expr * key);
+    unsigned regex_get_counter(obj_map<expr, unsigned> & counter_map, expr * key);
+
    void set_up_axioms(expr * ex);
    void handle_equality(expr * lhs, expr * rhs);

@ -535,6 +650,7 @@ protected:
            std::map<expr*, std::map<expr*, int> > & concat_eq_concat_map,
            std::map<expr*, std::set<expr*> > & unrollGroupMap);

+    bool term_appears_as_subterm(expr * needle, expr * haystack);
    void classify_ast_by_type(expr * node, std::map<expr*, int> & varMap,
            std::map<expr*, int> & concatMap, std::map<expr*, int> & unrollMap);
    void classify_ast_by_type_in_positive_context(std::map<expr*, int> & varMap,
@ -623,6 +739,7 @@ protected:
    void new_diseq_eh(theory_var, theory_var) override;

    theory* mk_fresh(context*) override { return alloc(theory_str, get_manager(), m_params); }
+    void init(context * ctx) override;
    void init_search_eh() override;
    void add_theory_assumptions(expr_ref_vector & assumptions) override;
    lbool validate_unsat_core(expr_ref_vector & unsat_core) override;