3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2025-04-08 10:25:18 +00:00

add basic regex functions

Signed-off-by: Nikolaj Bjorner <nbjorner@microsoft.com>
This commit is contained in:
Nikolaj Bjorner 2024-11-25 17:34:49 -08:00
parent b143a954c5
commit fce377e1d7
2 changed files with 167 additions and 26 deletions

View file

@ -74,11 +74,16 @@ Revert bias on long strings:
- bake in bias for shorter strings into equation solving?
Equality solving using stochastic Nelson.
- When solving for an equality w = v, first convert them into two vectors by removing concatenations.
The updates are then performed on the arguments to concatenations and not the concatenations themselves.
This saves some amount of spurious work when pushing assignments down over concatenations, which is
what the current first version of the solver does.
- Given equality where current assignment does not satisfy it:
- Xw = v:
- let X' range over prefixes of X that matches v.
- non-deterministic set X <- strval0(X')
- non-deterministic set X <- strval0(X') + 'a' where strval0(X') + 'a' matches prefix of strval0(v), and X' is longest prefix of X that matches v.
- non-deterministic set X <- strval0(X') + 'a' where strval0(X') + 'a'
matches prefix of strval0(v), and X' is longest prefix of X that matches v.
- If X fully matches a prefix of v, then, in addition to the rules above:
- consume constant character from strval0(X)w = v
- reveal the next variable to solve for.
@ -90,6 +95,8 @@ Equality solving using stochastic Nelson.
#include "ast/sls/sls_seq_plugin.h"
#include "ast/sls/sls_context.h"
#include "ast/ast_pp.h"
#include "ast/rewriter/seq_rewriter.h"
#include "ast/rewriter/th_rewriter.h"
namespace sls {
@ -258,7 +265,6 @@ namespace sls {
VERIFY(seq.str.is_contains(e, a, b));
if (seq.is_string(a->get_sort()))
return strval0(a).contains(strval0(b));
NOT_IMPLEMENTED_YET();
break;
case OP_SEQ_PREFIX:
@ -274,6 +280,11 @@ namespace sls {
NOT_IMPLEMENTED_YET();
break;
case OP_SEQ_IN_RE:
VERIFY(seq.str.is_in_re(e, a, b));
if (seq.is_string(a->get_sort()))
return is_in_re(strval0(a), b);
NOT_IMPLEMENTED_YET();
break;
case OP_SEQ_NTH:
case OP_SEQ_NTH_I:
case OP_SEQ_NTH_U:
@ -420,35 +431,24 @@ namespace sls {
}
void seq_plugin::repair_up(app* e) {
if (m.is_bool(e))
return;
if (seq.str.is_itos(e)) {
if (is_value(e))
return;
if (seq.str.is_itos(e))
repair_up_str_itos(e);
return;
}
if (seq.str.is_stoi(e)) {
else if (seq.str.is_stoi(e))
repair_up_str_stoi(e);
return;
}
if (seq.str.is_length(e)) {
else if (seq.str.is_length(e))
repair_up_str_length(e);
return;
}
if (seq.str.is_index(e)) {
else if (seq.str.is_index(e))
repair_up_str_indexof(e);
return;
}
if (seq.is_string(e->get_sort())) {
if (is_value(e))
return;
else if (seq.is_string(e->get_sort())) {
strval0(e) = strval1(e);
ctx.new_value_eh(e);
return;
}
verbose_stream() << "repair up nyi: " << mk_bounded_pp(e, m) << "\n";
else
verbose_stream() << "repair up nyi: " << mk_bounded_pp(e, m) << "\n";
}
bool seq_plugin::repair_down(app* e) {
@ -461,6 +461,7 @@ namespace sls {
if (m.is_eq(e))
return repair_down_eq(e);
NOT_IMPLEMENTED_YET();
return false;
}
@ -621,6 +622,10 @@ namespace sls {
return repair_down_str_itos(e);
case OP_STRING_STOI:
return repair_down_str_stoi(e);
case OP_SEQ_IN_RE:
if (seq.is_string(to_app(e)->get_arg(0)->get_sort()))
return repair_down_in_re(e);
break;
case OP_STRING_UBVTOS:
case OP_STRING_SBVTOS:
case OP_STRING_TO_CODE:
@ -639,8 +644,6 @@ namespace sls {
case OP_SEQ_FOLDLI:
case OP_SEQ_TO_RE:
case OP_SEQ_IN_RE:
case OP_RE_PLUS:
case OP_RE_STAR:
case OP_RE_OPTION:
@ -679,7 +682,6 @@ namespace sls {
m_int_updates.push_back({ x, r, 1 });
else
m_int_updates.push_back({ x, rational(-1 - ctx.rand(10)), 1 });
return apply_update();
}
@ -1137,4 +1139,132 @@ namespace sls {
return get_eval(e).is_value;
return m.is_value(e);
}
// Regular expressions
bool seq_plugin::is_in_re(zstring const& s, expr* r) {
expr_ref sval(seq.str.mk_string(s), m);
th_rewriter rw(m);
expr_ref in_re(seq.re.mk_in_re(sval, r), m);
rw(in_re);
SASSERT(m.limit().is_canceled() || m.is_true(in_re) || m.is_false(in_re));
return m.is_true(in_re);
}
bool seq_plugin::repair_down_in_re(app* e) {
expr* x, * y;
VERIFY(seq.str.is_in_re(e, x, y));
auto info = seq.re.get_info(y);
if (!info.interpreted)
return false;
auto s = strval0(x);
expr_ref xval(seq.str.mk_string(s), m);
expr_ref in_re(seq.re.mk_in_re(xval, y), m);
th_rewriter rw(m);
rw(in_re);
SASSERT(m.limit().is_canceled() || m.is_true(in_re) || m.is_false(in_re));
if (m.is_true(in_re) == ctx.is_true(e))
return true;
if (is_value(x))
return false;
vector<zstring> conts;
expr_ref d_r(y, m);
seq_rewriter seqrw(m);
for (unsigned i = 0; i < s.length(); ++i) {
verbose_stream() << "Derivative " << s.extract(0, i) << ": " << d_r << "\n";
if (seq.re.is_empty(d_r))
break;
zstring prefix = s.extract(0, i);
choose(d_r, 2, prefix, conts);
expr_ref ch(seq.str.mk_char(s[i]), m);
d_r = seqrw.mk_derivative(ch, d_r);
}
if (!seq.re.is_empty(d_r))
choose(d_r, 2, s, conts);
verbose_stream() << "repair in_re " << mk_pp(e, m) << " " << s << "\n";
for (auto& str : conts)
verbose_stream() << "prefix " << str << "\n";
// TODO: do some length analysis to prune out short candidates when there are longer ones.
// TODO: when matching .*"bcd" with string ab, the extension abc is more interesting than aba.
if (ctx.is_true(e)) {
for (auto& str : conts)
m_str_updates.push_back({ x, str, 1 });
}
else {
for (auto& str : conts)
m_str_updates.push_back({ x, str + m_chars[ctx.rand(m_chars.size())], 1});
}
return apply_update();
}
void seq_plugin::next_char(expr* r, unsigned_vector& chars) {
SASSERT(seq.is_re(r));
expr* x, * y;
zstring s;
if (seq.re.is_concat(r, x, y)) {
auto info = seq.re.get_info(x);
next_char(x, chars);
if (info.nullable == l_true)
next_char(y, chars);
}
else if (seq.re.is_to_re(r, x)) {
if (seq.str.is_string(x, s) && !s.empty())
chars.push_back(s[0]);
}
else if (seq.re.is_union(r, x, y)) {
next_char(x, chars);
next_char(y, chars);
}
else if (seq.re.is_range(r, x, y)) {
zstring s1, s2;
seq.str.is_string(x, s1);
seq.str.is_string(y, s2);
if (s1.length() == 1 && s2.length() == 1 && s1[0] <= s2[0]) {
chars.push_back(s1[0] + ctx.rand(s2[0] - s1[0] + 1));
chars.push_back(s1[0]);
chars.push_back(s2[0]);
}
}
else if (seq.re.is_star(r, x) || seq.re.is_plus(r, x)) {
next_char(x, chars);
}
else if (seq.re.is_empty(r)) {
;
}
else if (seq.re.is_full_seq(r)) {
if (!m_chars.empty())
chars.push_back(m_chars[ctx.rand(m_chars.size())]);
}
else if (seq.re.is_full_char(r)) {
if (!m_chars.empty())
chars.push_back(m_chars[ctx.rand(m_chars.size())]);
}
else {
verbose_stream() << "regex nyi " << mk_bounded_pp(r, m) << "\n";
NOT_IMPLEMENTED_YET();
}
}
void seq_plugin::choose(expr* r, unsigned k, zstring& prefix, vector<zstring>& result) {
auto info = seq.re.get_info(r);
result.push_back(prefix);
if (k == 0)
return;
unsigned_vector chars;
next_char(r, chars);
std::stable_sort(chars.begin(), chars.end());
auto it = std::unique(chars.begin(), chars.end());
chars.shrink((unsigned)(it - chars.begin()));
for (auto ch : chars) {
expr_ref c(seq.str.mk_char(ch), m);
seq_rewriter rw(m);
expr_ref r2 = rw.mk_derivative(c, r);
zstring prefix2 = prefix + zstring(ch);
choose(r2, k - 1, prefix2, result);
}
}
}

View file

@ -44,7 +44,7 @@ namespace sls {
arith_util a;
scoped_ptr_vector<eval> m_values;
indexed_uint_set m_chars;
bool m_initialized = false;
bool m_initialized = false;
struct str_update {
expr* e;
@ -81,13 +81,24 @@ namespace sls {
bool repair_down_str_suffixof(app* e);
bool repair_down_str_itos(app* e);
bool repair_down_str_stoi(app* e);
bool repair_down_in_re(app* e);
void repair_up_str_length(app* e);
void repair_up_str_indexof(app* e);
void repair_up_str_itos(app* e);
void repair_up_str_stoi(app* e);
// regex functionality
// enumerate set of strings that can match a prefix of regex r.
void choose(expr* r, unsigned k, zstring& prefix, vector<zstring>& result);
// enumerate set of possible next chars, including possibly sampling from m_chars for whild-cards.
void next_char(expr* r, unsigned_vector& chars);
bool is_in_re(zstring const& s, expr* r);
// access evaluation
bool is_seq_predicate(expr* e);
eval& get_eval(expr* e);