From 8b077ebbe799f0be78679f537167103e34904c92 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Tue, 28 Feb 2017 14:06:13 -0500 Subject: [PATCH] re-add regex NFA --- src/smt/theory_str.cpp | 161 ++++++++++++++++++++++++++++++++++++++--- src/smt/theory_str.h | 48 +++++++++++- 2 files changed, 199 insertions(+), 10 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 4440d6462..d81bb9471 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -598,7 +598,7 @@ app * theory_str::mk_unroll(expr * n, expr * bound) { ast_manager & m = get_manager(); expr * args[2] = {n, bound}; - app * unrollFunc = get_manager().mk_app(get_id(), OP_RE_UNROLL, 0, 0, 2, args); + app * unrollFunc = get_manager().mk_app(get_id(), _OP_RE_UNROLL, 0, 0, 2, args); m_trail.push_back(unrollFunc); expr_ref_vector items(m); @@ -4428,7 +4428,7 @@ void theory_str::process_concat_eq_type6(expr * concatAst1, expr * concatAst2) { void theory_str::process_unroll_eq_const_str(expr * unrollFunc, expr * constStr) { ast_manager & m = get_manager(); - if (!is_Unroll(to_app(unrollFunc))) { + if (!u.re.is_unroll(to_app(unrollFunc))) { return; } if (!u.str.is_string(constStr)) { @@ -5444,7 +5444,7 @@ void theory_str::get_grounded_concats(expr* node, std::map & varAl std::map & concatAliasMap, std::map & varConstMap, std::map & concatConstMap, std::map > & varEqConcatMap, std::map, std::set > > & groundedMap) { - if (is_Unroll(to_app(node))) { + if (u.re.is_unroll(to_app(node))) { return; } // ************************************************** @@ -6129,6 +6129,149 @@ bool theory_str::check_concat_len_in_eqc(expr * concat) { return no_assertions; } +// Convert a regular expression to an e-NFA using Thompson's construction +void nfa::convert_re(expr * e, unsigned & start, unsigned & end, seq_util & u) { + start = next_id(); + end = next_id(); + if (u.re.is_to_re(e)) { + app * a = to_app(e); + expr * arg_str = a->get_arg(0); + zstring str; + if (u.str.is_string(arg_str, str)) { + TRACE("t_str_rw", tout << "build NFA for '" << str << "'" << "\n";); + + /* + * For an n-character string, we make (n-1) intermediate states, + * labelled i_(0) through i_(n-2). + * Then we construct the following transitions: + * start --str[0]--> i_(0) --str[1]--> i_(1) --...--> i_(n-2) --str[n-1]--> final + */ + unsigned last = start; + for (int i = 0; i <= ((int)str.length()) - 2; ++i) { + unsigned i_state = next_id(); + make_transition(last, str[i], i_state); + TRACE("t_str_rw", tout << "string transition " << last << "--" << str[i] << "--> " << i_state << "\n";); + last = i_state; + } + make_transition(last, str[(str.length() - 1)], end); + TRACE("t_str_rw", tout << "string transition " << last << "--" << str[(str.length() - 1)] << "--> " << end << "\n";); + TRACE("t_str_rw", tout << "string NFA: start = " << start << ", end = " << end << std::endl;); + } else { + TRACE("t_str_rw", tout << "invalid string constant in Str2Reg" << std::endl;); + m_valid = false; + return; + } + } else if (u.re.is_concat(e)){ + app * a = to_app(e); + expr * re1 = a->get_arg(0); + expr * re2 = a->get_arg(1); + unsigned start1, end1; + convert_re(re1, start1, end1, u); + unsigned start2, end2; + convert_re(re2, start2, end2, u); + // start --e--> start1 --...--> end1 --e--> start2 --...--> end2 --e--> end + make_epsilon_move(start, start1); + make_epsilon_move(end1, start2); + make_epsilon_move(end2, end); + TRACE("t_str_rw", tout << "concat NFA: start = " << start << ", end = " << end << std::endl;); + } else if (u.re.is_union(e)) { + app * a = to_app(e); + expr * re1 = a->get_arg(0); + expr * re2 = a->get_arg(1); + unsigned start1, end1; + convert_re(re1, start1, end1, u); + unsigned start2, end2; + convert_re(re2, start2, end2, u); + + // start --e--> start1 ; start --e--> start2 + // end1 --e--> end ; end2 --e--> end + make_epsilon_move(start, start1); + make_epsilon_move(start, start2); + make_epsilon_move(end1, end); + make_epsilon_move(end2, end); + TRACE("t_str_rw", tout << "union NFA: start = " << start << ", end = " << end << std::endl;); + } else if (u.re.is_star(e)) { + app * a = to_app(e); + expr * subex = a->get_arg(0); + unsigned start_subex, end_subex; + convert_re(subex, start_subex, end_subex, u); + // start --e--> start_subex, start --e--> end + // end_subex --e--> start_subex, end_subex --e--> end + make_epsilon_move(start, start_subex); + make_epsilon_move(start, end); + make_epsilon_move(end_subex, start_subex); + make_epsilon_move(end_subex, end); + TRACE("t_str_rw", tout << "star NFA: start = " << start << ", end = " << end << std::endl;); + } else { + TRACE("t_str_rw", tout << "invalid regular expression" << std::endl;); + m_valid = false; + return; + } +} + +void nfa::epsilon_closure(unsigned start, std::set & closure) { + std::deque worklist; + closure.insert(start); + worklist.push_back(start); + + while(!worklist.empty()) { + unsigned state = worklist.front(); + worklist.pop_front(); + if (epsilon_map.find(state) != epsilon_map.end()) { + for (std::set::iterator it = epsilon_map[state].begin(); + it != epsilon_map[state].end(); ++it) { + unsigned new_state = *it; + if (closure.find(new_state) == closure.end()) { + closure.insert(new_state); + worklist.push_back(new_state); + } + } + } + } +} + +bool nfa::matches(zstring input) { + /* + * Keep a set of all states the NFA can currently be in. + * Initially this is the e-closure of m_start_state + * For each character A in the input string, + * the set of next states contains + * all states in transition_map[S][A] for each S in current_states, + * and all states in epsilon_map[S] for each S in current_states. + * After consuming the entire input string, + * the match is successful iff current_states contains m_end_state. + */ + std::set current_states; + epsilon_closure(m_start_state, current_states); + for (unsigned i = 0; i < input.length(); ++i) { + char A = input.at(i); + std::set next_states; + for (std::set::iterator it = current_states.begin(); + it != current_states.end(); ++it) { + unsigned S = *it; + // check transition_map + if (transition_map[S].find(A) != transition_map[S].end()) { + next_states.insert(transition_map[S][A]); + } + } + + // take e-closure over next_states to compute the actual next_states + std::set epsilon_next_states; + for (std::set::iterator it = next_states.begin(); it != next_states.end(); ++it) { + unsigned S = *it; + std::set closure; + epsilon_closure(S, closure); + epsilon_next_states.insert(closure.begin(), closure.end()); + } + current_states = epsilon_next_states; + } + if (current_states.find(m_end_state) != current_states.end()) { + return true; + } else { + return false; + } +} + void theory_str::check_regex_in(expr * nn1, expr * nn2) { context & ctx = get_context(); ast_manager & m = get_manager(); @@ -6159,7 +6302,7 @@ void theory_str::check_regex_in(expr * nn1, expr * nn2) { // TODO figure out regex NFA stuff if (regex_nfa_cache.find(regexTerm) == regex_nfa_cache.end()) { TRACE("t_str_detail", tout << "regex_nfa_cache: cache miss" << std::endl;); - regex_nfa_cache[regexTerm] = nfa(m_strutil, regexTerm); + regex_nfa_cache[regexTerm] = nfa(u, regexTerm); } else { TRACE("t_str_detail", tout << "regex_nfa_cache: cache hit" << std::endl;); } @@ -7385,7 +7528,7 @@ void theory_str::classify_ast_by_type(expr * node, std::map & varMap if (canskip == 0 && concatMap.find(node) == concatMap.end()) { concatMap[node] = 1; } - } else if (is_Unroll(aNode)) { + } else if (u.re.is_unroll(aNode)) { // Unroll if (unrollMap.find(node) == unrollMap.end()) { unrollMap[node] = 1; @@ -7658,7 +7801,7 @@ int theory_str::ctx_dep_analysis(std::map & strVarMap, std::mapget_owner(); - if (is_Unroll(curr)) { + if (u.re.is_unroll(curr)) { if (aRoot == NULL) { aRoot = curr; } @@ -7753,7 +7896,7 @@ int theory_str::ctx_dep_analysis(std::map & strVarMap, std::map & do { if (u.str.is_string(to_app(curr))) { constStr = curr; - } else if (is_Unroll(to_app(curr))) { + } else if (u.re.is_unroll(to_app(curr))) { if (unrollFuncSet.find(curr) == unrollFuncSet.end()) { unrollFuncSet.insert(curr); } @@ -10210,7 +10353,7 @@ void theory_str::get_eqc_simpleUnroll(expr * n, expr * &constStr, std::setget_arg(0); if (u.re.is_to_re(to_app(core))) { if (unrollFuncSet.find(curr) == unrollFuncSet.end()) { diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index 233b3b7f5..499bb23f8 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -110,6 +110,52 @@ namespace smt { } }; + + class nfa { + protected: + bool m_valid; + unsigned m_next_id; + + unsigned next_id() { + unsigned retval = m_next_id; + ++m_next_id; + return retval; + } + + unsigned m_start_state; + unsigned m_end_state; + + std::map > transition_map; + std::map > epsilon_map; + + void make_transition(unsigned start, char symbol, unsigned end) { + transition_map[start][symbol] = end; + } + + void make_epsilon_move(unsigned start, unsigned end) { + epsilon_map[start].insert(end); + } + + // Convert a regular expression to an e-NFA using Thompson's construction + void convert_re(expr * e, unsigned & start, unsigned & end, seq_util & u); + + public: + nfa(seq_util & u, expr * e) + : m_valid(true), m_next_id(0), m_start_state(0), m_end_state(0) { + convert_re(e, m_start_state, m_end_state, u); + } + + nfa() : m_valid(false), m_next_id(0), m_start_state(0), m_end_state(0) {} + + bool is_valid() const { + return m_valid; + } + + void epsilon_closure(unsigned start, std::set & closure); + + bool matches(zstring input); + }; + class theory_str : public theory { struct T_cut { @@ -274,7 +320,7 @@ namespace smt { std::map, expr*> regex_in_bool_map; std::map > regex_in_var_reg_str_map; - // std::map regex_nfa_cache; // Regex term --> NFA + std::map regex_nfa_cache; // Regex term --> NFA char * char_set; std::map charSetLookupTable;