mirror of
https://github.com/Z3Prover/z3
synced 2025-04-08 10:25:18 +00:00
add basic regex functions
Signed-off-by: Nikolaj Bjorner <nbjorner@microsoft.com>
This commit is contained in:
parent
b143a954c5
commit
fce377e1d7
|
@ -74,11 +74,16 @@ Revert bias on long strings:
|
|||
- bake in bias for shorter strings into equation solving?
|
||||
|
||||
Equality solving using stochastic Nelson.
|
||||
- When solving for an equality w = v, first convert them into two vectors by removing concatenations.
|
||||
The updates are then performed on the arguments to concatenations and not the concatenations themselves.
|
||||
This saves some amount of spurious work when pushing assignments down over concatenations, which is
|
||||
what the current first version of the solver does.
|
||||
- Given equality where current assignment does not satisfy it:
|
||||
- Xw = v:
|
||||
- let X' range over prefixes of X that matches v.
|
||||
- non-deterministic set X <- strval0(X')
|
||||
- non-deterministic set X <- strval0(X') + 'a' where strval0(X') + 'a' matches prefix of strval0(v), and X' is longest prefix of X that matches v.
|
||||
- non-deterministic set X <- strval0(X') + 'a' where strval0(X') + 'a'
|
||||
matches prefix of strval0(v), and X' is longest prefix of X that matches v.
|
||||
- If X fully matches a prefix of v, then, in addition to the rules above:
|
||||
- consume constant character from strval0(X)w = v
|
||||
- reveal the next variable to solve for.
|
||||
|
@ -90,6 +95,8 @@ Equality solving using stochastic Nelson.
|
|||
#include "ast/sls/sls_seq_plugin.h"
|
||||
#include "ast/sls/sls_context.h"
|
||||
#include "ast/ast_pp.h"
|
||||
#include "ast/rewriter/seq_rewriter.h"
|
||||
#include "ast/rewriter/th_rewriter.h"
|
||||
|
||||
|
||||
namespace sls {
|
||||
|
@ -258,7 +265,6 @@ namespace sls {
|
|||
VERIFY(seq.str.is_contains(e, a, b));
|
||||
if (seq.is_string(a->get_sort()))
|
||||
return strval0(a).contains(strval0(b));
|
||||
|
||||
NOT_IMPLEMENTED_YET();
|
||||
break;
|
||||
case OP_SEQ_PREFIX:
|
||||
|
@ -274,6 +280,11 @@ namespace sls {
|
|||
NOT_IMPLEMENTED_YET();
|
||||
break;
|
||||
case OP_SEQ_IN_RE:
|
||||
VERIFY(seq.str.is_in_re(e, a, b));
|
||||
if (seq.is_string(a->get_sort()))
|
||||
return is_in_re(strval0(a), b);
|
||||
NOT_IMPLEMENTED_YET();
|
||||
break;
|
||||
case OP_SEQ_NTH:
|
||||
case OP_SEQ_NTH_I:
|
||||
case OP_SEQ_NTH_U:
|
||||
|
@ -420,35 +431,24 @@ namespace sls {
|
|||
}
|
||||
|
||||
void seq_plugin::repair_up(app* e) {
|
||||
|
||||
if (m.is_bool(e))
|
||||
return;
|
||||
|
||||
if (seq.str.is_itos(e)) {
|
||||
if (is_value(e))
|
||||
return;
|
||||
if (seq.str.is_itos(e))
|
||||
repair_up_str_itos(e);
|
||||
return;
|
||||
}
|
||||
if (seq.str.is_stoi(e)) {
|
||||
else if (seq.str.is_stoi(e))
|
||||
repair_up_str_stoi(e);
|
||||
return;
|
||||
}
|
||||
if (seq.str.is_length(e)) {
|
||||
else if (seq.str.is_length(e))
|
||||
repair_up_str_length(e);
|
||||
return;
|
||||
}
|
||||
if (seq.str.is_index(e)) {
|
||||
else if (seq.str.is_index(e))
|
||||
repair_up_str_indexof(e);
|
||||
return;
|
||||
}
|
||||
if (seq.is_string(e->get_sort())) {
|
||||
if (is_value(e))
|
||||
return;
|
||||
else if (seq.is_string(e->get_sort())) {
|
||||
strval0(e) = strval1(e);
|
||||
ctx.new_value_eh(e);
|
||||
return;
|
||||
}
|
||||
|
||||
verbose_stream() << "repair up nyi: " << mk_bounded_pp(e, m) << "\n";
|
||||
else
|
||||
verbose_stream() << "repair up nyi: " << mk_bounded_pp(e, m) << "\n";
|
||||
}
|
||||
|
||||
bool seq_plugin::repair_down(app* e) {
|
||||
|
@ -461,6 +461,7 @@ namespace sls {
|
|||
if (m.is_eq(e))
|
||||
return repair_down_eq(e);
|
||||
|
||||
|
||||
NOT_IMPLEMENTED_YET();
|
||||
return false;
|
||||
}
|
||||
|
@ -621,6 +622,10 @@ namespace sls {
|
|||
return repair_down_str_itos(e);
|
||||
case OP_STRING_STOI:
|
||||
return repair_down_str_stoi(e);
|
||||
case OP_SEQ_IN_RE:
|
||||
if (seq.is_string(to_app(e)->get_arg(0)->get_sort()))
|
||||
return repair_down_in_re(e);
|
||||
break;
|
||||
case OP_STRING_UBVTOS:
|
||||
case OP_STRING_SBVTOS:
|
||||
case OP_STRING_TO_CODE:
|
||||
|
@ -639,8 +644,6 @@ namespace sls {
|
|||
case OP_SEQ_FOLDLI:
|
||||
|
||||
case OP_SEQ_TO_RE:
|
||||
case OP_SEQ_IN_RE:
|
||||
|
||||
case OP_RE_PLUS:
|
||||
case OP_RE_STAR:
|
||||
case OP_RE_OPTION:
|
||||
|
@ -679,7 +682,6 @@ namespace sls {
|
|||
m_int_updates.push_back({ x, r, 1 });
|
||||
else
|
||||
m_int_updates.push_back({ x, rational(-1 - ctx.rand(10)), 1 });
|
||||
|
||||
return apply_update();
|
||||
}
|
||||
|
||||
|
@ -1137,4 +1139,132 @@ namespace sls {
|
|||
return get_eval(e).is_value;
|
||||
return m.is_value(e);
|
||||
}
|
||||
|
||||
// Regular expressions
|
||||
|
||||
bool seq_plugin::is_in_re(zstring const& s, expr* r) {
|
||||
expr_ref sval(seq.str.mk_string(s), m);
|
||||
th_rewriter rw(m);
|
||||
expr_ref in_re(seq.re.mk_in_re(sval, r), m);
|
||||
rw(in_re);
|
||||
SASSERT(m.limit().is_canceled() || m.is_true(in_re) || m.is_false(in_re));
|
||||
return m.is_true(in_re);
|
||||
}
|
||||
|
||||
bool seq_plugin::repair_down_in_re(app* e) {
|
||||
expr* x, * y;
|
||||
VERIFY(seq.str.is_in_re(e, x, y));
|
||||
auto info = seq.re.get_info(y);
|
||||
if (!info.interpreted)
|
||||
return false;
|
||||
auto s = strval0(x);
|
||||
expr_ref xval(seq.str.mk_string(s), m);
|
||||
expr_ref in_re(seq.re.mk_in_re(xval, y), m);
|
||||
th_rewriter rw(m);
|
||||
rw(in_re);
|
||||
SASSERT(m.limit().is_canceled() || m.is_true(in_re) || m.is_false(in_re));
|
||||
if (m.is_true(in_re) == ctx.is_true(e))
|
||||
return true;
|
||||
|
||||
if (is_value(x))
|
||||
return false;
|
||||
|
||||
vector<zstring> conts;
|
||||
expr_ref d_r(y, m);
|
||||
seq_rewriter seqrw(m);
|
||||
for (unsigned i = 0; i < s.length(); ++i) {
|
||||
verbose_stream() << "Derivative " << s.extract(0, i) << ": " << d_r << "\n";
|
||||
if (seq.re.is_empty(d_r))
|
||||
break;
|
||||
zstring prefix = s.extract(0, i);
|
||||
choose(d_r, 2, prefix, conts);
|
||||
expr_ref ch(seq.str.mk_char(s[i]), m);
|
||||
d_r = seqrw.mk_derivative(ch, d_r);
|
||||
}
|
||||
if (!seq.re.is_empty(d_r))
|
||||
choose(d_r, 2, s, conts);
|
||||
|
||||
verbose_stream() << "repair in_re " << mk_pp(e, m) << " " << s << "\n";
|
||||
for (auto& str : conts)
|
||||
verbose_stream() << "prefix " << str << "\n";
|
||||
|
||||
// TODO: do some length analysis to prune out short candidates when there are longer ones.
|
||||
// TODO: when matching .*"bcd" with string ab, the extension abc is more interesting than aba.
|
||||
if (ctx.is_true(e)) {
|
||||
for (auto& str : conts)
|
||||
m_str_updates.push_back({ x, str, 1 });
|
||||
}
|
||||
else {
|
||||
for (auto& str : conts)
|
||||
m_str_updates.push_back({ x, str + m_chars[ctx.rand(m_chars.size())], 1});
|
||||
}
|
||||
return apply_update();
|
||||
}
|
||||
|
||||
void seq_plugin::next_char(expr* r, unsigned_vector& chars) {
|
||||
SASSERT(seq.is_re(r));
|
||||
expr* x, * y;
|
||||
zstring s;
|
||||
if (seq.re.is_concat(r, x, y)) {
|
||||
auto info = seq.re.get_info(x);
|
||||
next_char(x, chars);
|
||||
if (info.nullable == l_true)
|
||||
next_char(y, chars);
|
||||
}
|
||||
else if (seq.re.is_to_re(r, x)) {
|
||||
if (seq.str.is_string(x, s) && !s.empty())
|
||||
chars.push_back(s[0]);
|
||||
}
|
||||
else if (seq.re.is_union(r, x, y)) {
|
||||
next_char(x, chars);
|
||||
next_char(y, chars);
|
||||
}
|
||||
else if (seq.re.is_range(r, x, y)) {
|
||||
zstring s1, s2;
|
||||
seq.str.is_string(x, s1);
|
||||
seq.str.is_string(y, s2);
|
||||
if (s1.length() == 1 && s2.length() == 1 && s1[0] <= s2[0]) {
|
||||
chars.push_back(s1[0] + ctx.rand(s2[0] - s1[0] + 1));
|
||||
chars.push_back(s1[0]);
|
||||
chars.push_back(s2[0]);
|
||||
}
|
||||
}
|
||||
else if (seq.re.is_star(r, x) || seq.re.is_plus(r, x)) {
|
||||
next_char(x, chars);
|
||||
}
|
||||
else if (seq.re.is_empty(r)) {
|
||||
;
|
||||
}
|
||||
else if (seq.re.is_full_seq(r)) {
|
||||
if (!m_chars.empty())
|
||||
chars.push_back(m_chars[ctx.rand(m_chars.size())]);
|
||||
}
|
||||
else if (seq.re.is_full_char(r)) {
|
||||
if (!m_chars.empty())
|
||||
chars.push_back(m_chars[ctx.rand(m_chars.size())]);
|
||||
}
|
||||
else {
|
||||
verbose_stream() << "regex nyi " << mk_bounded_pp(r, m) << "\n";
|
||||
NOT_IMPLEMENTED_YET();
|
||||
}
|
||||
}
|
||||
|
||||
void seq_plugin::choose(expr* r, unsigned k, zstring& prefix, vector<zstring>& result) {
|
||||
auto info = seq.re.get_info(r);
|
||||
result.push_back(prefix);
|
||||
if (k == 0)
|
||||
return;
|
||||
unsigned_vector chars;
|
||||
next_char(r, chars);
|
||||
std::stable_sort(chars.begin(), chars.end());
|
||||
auto it = std::unique(chars.begin(), chars.end());
|
||||
chars.shrink((unsigned)(it - chars.begin()));
|
||||
for (auto ch : chars) {
|
||||
expr_ref c(seq.str.mk_char(ch), m);
|
||||
seq_rewriter rw(m);
|
||||
expr_ref r2 = rw.mk_derivative(c, r);
|
||||
zstring prefix2 = prefix + zstring(ch);
|
||||
choose(r2, k - 1, prefix2, result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -44,7 +44,7 @@ namespace sls {
|
|||
arith_util a;
|
||||
scoped_ptr_vector<eval> m_values;
|
||||
indexed_uint_set m_chars;
|
||||
bool m_initialized = false;
|
||||
bool m_initialized = false;
|
||||
|
||||
struct str_update {
|
||||
expr* e;
|
||||
|
@ -81,13 +81,24 @@ namespace sls {
|
|||
bool repair_down_str_suffixof(app* e);
|
||||
bool repair_down_str_itos(app* e);
|
||||
bool repair_down_str_stoi(app* e);
|
||||
bool repair_down_in_re(app* e);
|
||||
|
||||
void repair_up_str_length(app* e);
|
||||
void repair_up_str_indexof(app* e);
|
||||
void repair_up_str_itos(app* e);
|
||||
void repair_up_str_stoi(app* e);
|
||||
|
||||
// regex functionality
|
||||
|
||||
// enumerate set of strings that can match a prefix of regex r.
|
||||
void choose(expr* r, unsigned k, zstring& prefix, vector<zstring>& result);
|
||||
|
||||
// enumerate set of possible next chars, including possibly sampling from m_chars for whild-cards.
|
||||
void next_char(expr* r, unsigned_vector& chars);
|
||||
|
||||
bool is_in_re(zstring const& s, expr* r);
|
||||
|
||||
// access evaluation
|
||||
bool is_seq_predicate(expr* e);
|
||||
|
||||
eval& get_eval(expr* e);
|
||||
|
|
Loading…
Reference in a new issue