From 2c33bd6fafeded0c2c9dbfc8b76e1b872cc44ad1 Mon Sep 17 00:00:00 2001
From: Margus Veanes <margus@microsoft.com>
Date: Thu, 13 Aug 2020 12:40:35 -0700
Subject: [PATCH] pp support for regex expressions is more-or-less standard
 syntax

---
 src/ast/seq_decl_plugin.cpp | 188 ++++++++++++++++++++++++++++++++++++
 src/ast/seq_decl_plugin.h   |   7 ++
 src/smt/seq_regex.cpp       |   9 +-
 3 files changed, 203 insertions(+), 1 deletion(-)

diff --git a/src/ast/seq_decl_plugin.cpp b/src/ast/seq_decl_plugin.cpp
index afa3a023f..bf55b4154 100644
--- a/src/ast/seq_decl_plugin.cpp
+++ b/src/ast/seq_decl_plugin.cpp
@@ -1465,3 +1465,191 @@ bool seq_util::re::is_epsilon(expr* r) const {
 app* seq_util::re::mk_epsilon(sort* seq_sort) {
     return mk_to_re(u.str.mk_empty(seq_sort));
 }
+
+/*
+  Provides a standard pretty printed view of the regex r when possible.
+*/
+std::string seq_util::re::pp(expr* r) {
+    SASSERT(u.is_re(r));
+    std::ostringstream buffer;
+    pp_compact_to_buffer(buffer, r);
+    return buffer.str();
+}
+
+void seq_util::re::pp_compact_to_buffer(std::ostringstream& buffer, expr* r) {
+    SASSERT(u.is_re(r));
+    expr* r1 = nullptr, * r2 = nullptr, * s = nullptr, * s2 = nullptr;
+    unsigned lo = 0, hi = 0;
+    if (is_full_char(r))
+        buffer << ".";
+    else if (is_full_seq(r))
+        buffer << ".*";
+    else if (is_to_re(r, s))
+        pp_compact_helper_seq(buffer, s);
+    else if (is_range(r, s, s2))
+        pp_compact_helper_range(buffer, s, s2);
+    else if (is_epsilon(r))
+        buffer << "()";
+    else if (is_empty(r))
+        buffer << "[]";
+    else if (is_concat(r, r1, r2)) {
+        pp_compact_to_buffer(buffer, r1);
+        pp_compact_to_buffer(buffer, r2);
+    }
+    else if (is_union(r, r1, r2)) {
+        pp_compact_to_buffer(buffer, r1);
+        buffer << "|";
+        pp_compact_to_buffer(buffer, r2);
+    }
+    else if (is_intersection(r, r1, r2)) {
+        buffer << "(";
+        pp_compact_to_buffer(buffer, r1);
+        buffer << ")&(";
+        pp_compact_to_buffer(buffer, r2);
+        buffer << ")";
+    }
+    else if (is_complement(r, r1)) {
+        buffer << "~";
+        if (pp_can_skip_parenth(r1))
+            pp_compact_to_buffer(buffer, r1);
+        else {
+            buffer << "(";
+            pp_compact_to_buffer(buffer, r1);
+            buffer << ")";
+        }
+    }
+    else if (is_plus(r, r1))
+        if (pp_can_skip_parenth(r1)) {
+            pp_compact_to_buffer(buffer, r1);
+            buffer << "+";
+        }
+        else {
+            buffer << "(";
+            pp_compact_to_buffer(buffer, r1);
+            buffer << ")+";
+        }
+    else if (is_star(r, r1))
+        if (pp_can_skip_parenth(r1)) {
+            pp_compact_to_buffer(buffer, r1);
+            buffer << "*";
+        }
+        else {
+            buffer << "(";
+            pp_compact_to_buffer(buffer, r1);
+            buffer << ")*";
+        }
+    else if (is_loop(r, r1, lo))
+        if (pp_can_skip_parenth(r1))
+        {
+            pp_compact_to_buffer(buffer, r1);
+            buffer << "{" << std::to_string(lo) << ",}";
+        }
+        else {
+            buffer << "(";
+            pp_compact_to_buffer(buffer, r1);
+            buffer << "){" << std::to_string(lo) << ",}";
+        }
+    else if (is_loop(r, r1, lo, hi))
+        if (pp_can_skip_parenth(r1))
+        {
+            pp_compact_to_buffer(buffer, r1);
+            buffer << "{" << std::to_string(lo) << "," << std::to_string(hi) << "}";
+        }
+        else {
+            buffer << "(";
+            pp_compact_to_buffer(buffer, r1);
+            buffer << "){" << std::to_string(lo) << "," << std::to_string(hi) << "}";
+        }
+    else if (is_diff(r, r1, r2)) {
+        buffer << "(";
+        pp_compact_to_buffer(buffer, r1);
+        buffer << ")\\(";
+        pp_compact_to_buffer(buffer, r2);
+        buffer << ")";
+    }
+    else if (m.is_ite(r, s, r1, r2)) {
+        buffer << "if(" << mk_pp(s, m) << ",";
+        pp_compact_to_buffer(buffer, r1);
+        buffer << ",";
+        pp_compact_to_buffer(buffer, r2);
+        buffer << ")";
+    }
+    else if (is_opt(r, r1))
+        if (pp_can_skip_parenth(r1)) {
+            pp_compact_to_buffer(buffer, r1);
+            buffer << "?";
+        }
+        else {
+            buffer << "(";
+            pp_compact_to_buffer(buffer, r1);
+            buffer << ")?";
+        }
+    else if (is_reverse(r, r1)) {
+        buffer << "reverse(";
+        pp_compact_to_buffer(buffer, r1);
+        buffer << ")";
+    }
+    else
+        // Else: derivative or is_of_pred
+        buffer << mk_pp(r, m);
+}
+
+void seq_util::re::pp_compact_helper_seq(std::ostringstream& buffer, expr* s) {
+    SASSERT(u.is_seq(s));
+    if (m.is_value(s)) {
+        SASSERT(s->get_kind() == ast_kind::AST_APP);
+        if (u.str.is_concat(s)) {
+            expr_ref_vector es(m);
+            u.str.get_concat(s, es);
+            for (unsigned i = 0; i < es.size(); i++)
+                if (u.str.is_unit(es.get(i)))
+                    pp_seq_unit(buffer, es.get(i));
+                else
+                    buffer << mk_pp(es.get(i), m);
+        }
+        else
+            pp_seq_unit(buffer, s);
+    }
+    else
+        buffer << mk_pp(s, m);
+}
+
+void seq_util::re::pp_compact_helper_range(std::ostringstream& buffer, expr* s1, expr* s2) {
+    buffer << "[";
+    if (u.str.is_unit(s1))
+        pp_seq_unit(buffer, s1);
+    else 
+        buffer << mk_pp(s1, m);
+    buffer << "-";
+    if (u.str.is_unit(s2))
+        pp_seq_unit(buffer, s2);
+    else
+        buffer << mk_pp(s1, m);
+    buffer << "]";
+}
+
+bool seq_util::re::pp_can_skip_parenth(expr* r) {
+    expr* s;
+    return ((is_to_re(r, s) && u.str.is_unit(s)) || is_range(r));
+}
+
+void seq_util::re::pp_seq_unit(std::ostringstream& buffer, expr* s) {
+    expr* e;
+    if (u.str.is_unit(s, e)) {
+        rational r;
+        unsigned sz;
+        if (u.bv().is_numeral(e, r, sz) && sz == 8 && r.is_unsigned()) {
+            unsigned n = r.get_unsigned();
+            if (32 < n && n < 127)
+                buffer << (char)n;
+            else if (n < 10)
+                buffer << "\\x0" << std::hex << n;
+            else
+                buffer << "\\x" << std::hex << n;
+        }
+        else
+            buffer << mk_pp(s, m);
+    }
+    else
+        buffer << mk_pp(s, m);
+}
diff --git a/src/ast/seq_decl_plugin.h b/src/ast/seq_decl_plugin.h
index eb2feba7a..e4f55a7af 100644
--- a/src/ast/seq_decl_plugin.h
+++ b/src/ast/seq_decl_plugin.h
@@ -412,6 +412,12 @@ public:
         seq_util&    u;
         ast_manager& m;
         family_id    m_fid;
+        void seq_util::re::pp_compact_helper_seq(std::ostringstream& buffer, expr* s);
+        void seq_util::re::pp_compact_helper_range(std::ostringstream& buffer, expr* s1, expr* s2);
+        bool seq_util::re::pp_can_skip_parenth(expr* r);
+        void seq_util::re::pp_seq_unit(std::ostringstream& buffer, expr* s);
+        void pp_compact_to_buffer(std::ostringstream& buffer, expr* r);
+
     public:
         re(seq_util& u): u(u), m(u.m), m_fid(u.m_fid) {}
 
@@ -481,6 +487,7 @@ public:
         unsigned max_length(expr* r) const;
         bool is_epsilon(expr* r) const;
         app* mk_epsilon(sort* seq_sort);
+        std::string pp(expr* r);
     };
     str str;
     re  re;
diff --git a/src/smt/seq_regex.cpp b/src/smt/seq_regex.cpp
index 06b5e7f89..99ddfbc5b 100644
--- a/src/smt/seq_regex.cpp
+++ b/src/smt/seq_regex.cpp
@@ -791,6 +791,10 @@ namespace smt {
             STRACE("seq_regex_brief", tout << "(MAX SIZE REACHED) ";);
             return false;
         }
+        STRACE("seq_regex", tout << "Updating state graph for regex "
+                                 << mk_pp(r, m) << ") ";);
+        if (!m_state_graph.is_seen(r_id))
+            STRACE("state_graph", tout << std::endl << "state(" << r_id << ") = " << re().pp(r) << std::endl;);
         // Add state
         m_state_graph.add_state(r_id);
         STRACE("state_graph", tout << "regex(" << r_id << ") = " << mk_pp(r, m) << std::endl;);
@@ -811,7 +815,10 @@ namespace smt {
             for (auto const& dr: derivatives) {
                 unsigned dr_id = get_state_id(dr);
                 STRACE("seq_regex_verbose", tout
-                    << "  traversing deriv: " << dr_id << " " << std::endl;);
+                    << std::endl << "  traversing deriv: " << dr_id << " ";);
+                if (!m_state_graph.is_seen(dr_id))
+                    STRACE("state_graph", tout << "state(" << dr_id << ") = " << re().pp(dr) << std::endl;);
+                // Add state
                 m_state_graph.add_state(dr_id);
                 STRACE("state_graph", tout << "regex(" << dr_id << ") = " << mk_pp(dr, m) << std::endl;);
                 bool maybecycle = can_be_in_cycle(r, dr);