From 20332c6d3e548b971a98870f7165f2a4a4578fe6 Mon Sep 17 00:00:00 2001
From: Nikolaj Bjorner <nbjorner@microsoft.com>
Date: Tue, 26 Jan 2021 16:28:44 -0800
Subject: [PATCH] adding char decl plugin for separate theory treatment

Signed-off-by: Nikolaj Bjorner <nbjorner@microsoft.com>
---
 src/ast/CMakeLists.txt      |   1 +
 src/ast/seq_decl_plugin.cpp | 363 ----------------------------------
 src/ast/seq_decl_plugin.h   |  38 +---
 src/smt/theory_char.h       |  29 +--
 src/util/CMakeLists.txt     |   1 +
 src/util/zstring.cpp        | 383 ++++++++++++++++++++++++++++++++++++
 src/util/zstring.h          |  57 ++++++
 7 files changed, 452 insertions(+), 420 deletions(-)
 create mode 100644 src/util/zstring.cpp
 create mode 100644 src/util/zstring.h
diff --git a/src/ast/CMakeLists.txt b/src/ast/CMakeLists.txt
index d4c7291f4..b0c9f7724 100644
--- a/src/ast/CMakeLists.txt
+++ b/src/ast/CMakeLists.txt
@@ -14,6 +14,7 @@ z3_add_component(ast
     ast_translation.cpp
     ast_util.cpp
     bv_decl_plugin.cpp
+    char_decl_plugin.cpp
     datatype_decl_plugin.cpp
     decl_collector.cpp
     display_dimacs.cpp
diff --git a/src/ast/seq_decl_plugin.cpp b/src/ast/seq_decl_plugin.cpp
index 1273dba85..05cb4d8ab 100644
--- a/src/ast/seq_decl_plugin.cpp
+++ b/src/ast/seq_decl_plugin.cpp
@@ -24,369 +24,6 @@ Revision History:
 #include "ast/bv_decl_plugin.h"
 #include <sstream>
 
-static bool is_hex_digit(char ch, unsigned& d) {
-    if ('0' <= ch && ch <= '9') {
-        d = ch - '0';
-        return true;
-    }
-    if ('A' <= ch && ch <= 'F') {
-        d = 10 + ch - 'A';
-        return true;
-    }
-    if ('a' <= ch && ch <= 'f') {
-        d = 10 + ch - 'a';
-        return true;
-    }
-    return false;
-}
-
-static bool is_octal_digit(char ch, unsigned& d) {
-    if ('0' <= ch && ch <= '7') {
-        d = ch - '0';
-        return true;
-    }
-    return false;
-}
-
-bool zstring::is_escape_char(char const *& s, unsigned& result) {
-    unsigned d1, d2, d3;
-    if (*s != '\\' || *(s + 1) == 0) {
-        return false;
-    }
-    if (*(s + 1) == 'x' &&
-        is_hex_digit(*(s + 2), d1) && is_hex_digit(*(s + 3), d2)) {
-        result = d1*16 + d2;
-        s += 4;
-        return true;
-    }
-    /* C-standard octal escapes: either 1, 2, or 3 octal digits,
-     * stopping either at 3 digits or at the first non-digit character.
-     */
-    /* 1 octal digit */
-    if (is_octal_digit(*(s + 1), d1) && !is_octal_digit(*(s + 2), d2)) {
-        result = d1;
-        s += 2;
-        return true;
-    }
-    /* 2 octal digits */
-    if (is_octal_digit(*(s + 1), d1) && is_octal_digit(*(s + 2), d2) &&
-        !is_octal_digit(*(s + 3), d3)) {
-        result = d1 * 8 + d2;
-        s += 3;
-        return true;
-    }
-    /* 3 octal digits */
-    if (is_octal_digit(*(s + 1), d1) && is_octal_digit(*(s + 2), d2) &&
-        is_octal_digit(*(s + 3), d3)) {
-        result = d1*64 + d2*8 + d3;
-        s += 4;
-        return true;
-    }
-
-    if (*(s+1) == 'u' && *(s+2) == '{') {
-        result = 0;
-        for (unsigned i = 0; i < 5; ++i) {
-            if (is_hex_digit(*(s+3+i), d1)) {
-                result = 16*result + d1;
-            }
-            else if (*(s+3+i) == '}') {
-                if (result > 255 && !uses_unicode())
-                    throw default_exception("unicode characters outside of byte range are not supported");
-                s += 4 + i;
-                return true;                
-            }
-            else {
-                break;
-            }
-        }
-        return false;
-    }
-    if (*(s+1) == 'u' && is_hex_digit(*(s+2), d1)) {
-        result = d1;
-        unsigned i = 0;
-        for (; i < 4; ++i) {
-            if (is_hex_digit(*(s+3+i), d1)) {
-                result = 16*result + d1;
-            }
-            else {
-                break;
-            }
-        }
-        if (result > 255 && !uses_unicode())
-            throw default_exception("unicode characters outside of byte range are not supported");
-        s += 3 + i;
-        return true;
-    }
-    switch (*(s + 1)) {
-    case 'a':
-        result = '\a';
-        s += 2;
-        return true;
-    case 'b':
-        result = '\b';
-        s += 2;
-        return true;
-#if 0
-    case 'e':
-        result = '\e';
-        s += 2;
-        return true;
-#endif
-    case 'f':
-        result = '\f';
-        s += 2;
-        return true;
-    case 'n':
-        result = '\n';
-        s += 2;
-        return true;
-    case 'r':
-        result = '\r';
-        s += 2;
-        return true;
-    case 't':
-        result = '\t';
-        s += 2;
-        return true;
-    case 'v':
-        result = '\v';
-        s += 2;
-        return true;
-    default:
-        result = *(s + 1);
-        s += 2;
-        return true;
-    }
-    return false;
-}
-
-zstring::zstring(char const* s) {
-    while (*s) {
-        unsigned ch = 0;
-        if (is_escape_char(s, ch)) {
-            m_buffer.push_back(ch);
-        }
-        else {
-            m_buffer.push_back(*s);
-            ++s;
-        }
-    }
-    SASSERT(well_formed());
-}
-
-bool zstring::uses_unicode() const {
-    return gparams::get_value("unicode") == "true";
-}
-
-bool zstring::well_formed() const {
-    for (unsigned ch : m_buffer) {
-        if (ch > unicode_max_char()) {
-            IF_VERBOSE(0, verbose_stream() << "large character: " << ch << "\n";);
-            return false;
-        }
-    }
-    return true;
-}
-
-zstring::zstring(unsigned ch) {
-    m_buffer.push_back(ch);
-}
-
-zstring zstring::reverse() const {
-    zstring result;
-    for (unsigned i = length(); i-- > 0; ) {
-        result.m_buffer.push_back(m_buffer[i]);
-    }
-    return result;
-}
-
-zstring zstring::replace(zstring const& src, zstring const& dst) const {
-    zstring result;
-    if (length() < src.length()) {
-        return zstring(*this);
-    }
-    if (src.length() == 0) {
-        return dst + zstring(*this);
-    }
-    bool found = false;
-    for (unsigned i = 0; i < length(); ++i) {
-        bool eq = !found && i + src.length() <= length();
-        for (unsigned j = 0; eq && j < src.length(); ++j) {
-            eq = m_buffer[i+j] == src[j];
-        }
-        if (eq) {
-            result.m_buffer.append(dst.m_buffer);
-            found = true;
-            i += src.length() - 1;
-        }
-        else {
-            result.m_buffer.push_back(m_buffer[i]);
-        }
-    }
-    return result;
-}
-
-static const char esc_table[32][6] =
-    { "\\x00", "\\x01", "\\x02", "\\x03", "\\x04", "\\x05", "\\x06", "\\x07", "\\x08", "\\x09", "\\n",   "\\v",   "\\f",   "\\r",   "\\x0E", "\\x0F",
-      "\\x10", "\\x11", "\\x12", "\\x13", "\\x14", "\\x15", "\\x16", "\\x17", "\\x18", "\\x19", "\\x1A", "\\x1B", "\\x1C", "\\x1D", "\\x1E", "\\x1F"
-};
-
-std::string zstring::encode() const {
-    std::ostringstream strm;
-    char buffer[100];
-    unsigned offset = 0;
-#define _flush() if (offset > 0) { buffer[offset] = 0; strm << buffer; offset = 0; }
-    for (unsigned i = 0; i < m_buffer.size(); ++i) {
-        unsigned ch = m_buffer[i];
-        if (0 <= ch && ch < 32) {
-            _flush();
-            strm << esc_table[ch];
-        }
-        else if (ch == '\\') {
-            _flush();
-            strm << "\\\\";
-        }
-        else if (ch >= 256) {
-            _flush();
-            strm << "\\u{" << std::hex << ch << std::dec << "}";
-        }
-        else if (ch >= 128) {
-            _flush();
-            strm << "\\x" << std::hex << ch << std::dec; 
-        }
-        else {
-            if (offset == 99) { 
-                _flush();
-            }
-            buffer[offset++] = (char)ch;
-        }
-    }
-    _flush();
-    return strm.str();
-}
-
-bool zstring::suffixof(zstring const& other) const {
-    if (length() > other.length()) return false;
-    bool suffix = true;
-    for (unsigned i = 0; suffix && i < length(); ++i) {
-        suffix = m_buffer[length()-i-1] == other[other.length()-i-1];
-    }
-    return suffix;
-}
-
-bool zstring::prefixof(zstring const& other) const {
-    if (length() > other.length()) return false;
-    bool prefix = true;
-    for (unsigned i = 0; prefix && i < length(); ++i) {
-        prefix = m_buffer[i] == other[i];
-    }
-    return prefix;
-}
-
-bool zstring::contains(zstring const& other) const {
-    if (other.length() > length()) return false;
-    unsigned last = length() - other.length();
-    bool cont = false;
-    for (unsigned i = 0; !cont && i <= last; ++i) {
-        cont = true;
-        for (unsigned j = 0; cont && j < other.length(); ++j) {
-            cont = other[j] == m_buffer[j+i];
-        }
-    }
-    return cont;
-}
-
-int zstring::indexofu(zstring const& other, unsigned offset) const {
-    if (offset <= length() && other.length() == 0) return offset;
-    if (offset == length()) return -1;
-    if (offset > other.length() + offset) return -1;
-    if (other.length() + offset > length()) return -1;
-    unsigned last = length() - other.length();
-    for (unsigned i = offset; i <= last; ++i) {
-        bool prefix = true;
-        for (unsigned j = 0; prefix && j < other.length(); ++j) {
-            prefix = m_buffer[i + j] == other[j];
-        }
-        if (prefix) {
-            return static_cast<int>(i);
-        }
-    }
-    return -1;
-}
-
-int zstring::last_indexof(zstring const& other) const {
-    if (other.length() == 0) return length();
-    if (other.length() > length()) return -1;
-    for (unsigned last = length() - other.length(); last-- > 0; ) {
-        bool suffix = true;
-        for (unsigned j = 0; suffix && j < other.length(); ++j) {
-            suffix = m_buffer[last + j] == other[j];
-        }
-        if (suffix) {
-            return static_cast<int>(last);
-        }
-    }
-    return -1;
-}
-
-zstring zstring::extract(unsigned offset, unsigned len) const {
-    zstring result;
-    if (offset + len < offset) return result;
-    int last = std::min(offset+len, length());
-    for (int i = offset; i < last; ++i) {
-        result.m_buffer.push_back(m_buffer[i]);
-    }
-    return result;
-}
-
-zstring zstring::operator+(zstring const& other) const {
-    zstring result(*this);
-    result.m_buffer.append(other.m_buffer);
-    return result;
-}
-
-bool zstring::operator==(const zstring& other) const {
-    // two strings are equal iff they have the same length and characters
-    if (length() != other.length()) {
-        return false;
-    }
-    for (unsigned i = 0; i < length(); ++i) {
-        if (m_buffer[i] != other[i]) {
-            return false;
-        }
-    }
-    return true;
-}
-
-bool zstring::operator!=(const zstring& other) const {
-    return !(*this == other);
-}
-
-std::ostream& operator<<(std::ostream &os, const zstring &str) {
-    return os << str.encode();
-}
-
-bool operator<(const zstring& lhs, const zstring& rhs) {
-    // This has the same semantics as strcmp()
-    unsigned len = lhs.length();
-    if (rhs.length() < len) {
-        len = rhs.length();
-    }
-    for (unsigned i = 0; i < len; ++i) {
-        unsigned Li = lhs[i];
-        unsigned Ri = rhs[i];
-        if (Li < Ri) {
-            return true;
-        } 
-        else if (Li > Ri) {
-            return false;
-        } 
-    }
-    // at this point, all compared characters are equal,
-    // so decide based on the relative lengths
-    return lhs.length() < rhs.length();
-}
-
 
 seq_decl_plugin::seq_decl_plugin(): m_init(false),
                                     m_stringc_sym("String"),
diff --git a/src/ast/seq_decl_plugin.h b/src/ast/seq_decl_plugin.h
index 4fdc8a065..3cc1abc8f 100644
--- a/src/ast/seq_decl_plugin.h
+++ b/src/ast/seq_decl_plugin.h
@@ -24,8 +24,8 @@ Revision History:
 
 #include "ast/ast.h"
 #include "ast/bv_decl_plugin.h"
-#include <string>
 #include "util/lbool.h"
+#include "util/zstring.h"
 
 enum seq_sort_kind {
     SEQ_SORT,
@@ -110,42 +110,6 @@ enum seq_op_kind {
 };
 
 
-class zstring {
-private:
-    buffer<unsigned> m_buffer;
-    bool well_formed() const;
-    bool uses_unicode() const;
-    bool is_escape_char(char const *& s, unsigned& result);
-public:
-    static unsigned unicode_max_char() { return 196607; }
-    static unsigned unicode_num_bits() { return 18; }
-    static unsigned ascii_max_char() { return 255; }
-    static unsigned ascii_num_bits() { return 8; }
-    zstring() {}
-    zstring(char const* s);
-    zstring(const std::string &str) : zstring(str.c_str()) {}
-    zstring(unsigned sz, unsigned const* s) { m_buffer.append(sz, s); SASSERT(well_formed()); }
-    zstring(unsigned ch);
-    zstring replace(zstring const& src, zstring const& dst) const;
-    zstring reverse() const;
-    std::string encode() const;
-    unsigned length() const { return m_buffer.size(); }
-    unsigned operator[](unsigned i) const { return m_buffer[i]; }
-    bool empty() const { return m_buffer.empty(); }
-    bool suffixof(zstring const& other) const;
-    bool prefixof(zstring const& other) const;
-    bool contains(zstring const& other) const;
-    int  indexofu(zstring const& other, unsigned offset) const;
-    int  last_indexof(zstring const& other) const;
-    zstring extract(unsigned lo, unsigned hi) const;
-    zstring operator+(zstring const& other) const;
-    bool operator==(const zstring& other) const;
-    bool operator!=(const zstring& other) const;
-
-    friend std::ostream& operator<<(std::ostream &os, const zstring &str);
-    friend bool operator<(const zstring& lhs, const zstring& rhs);
-};
-
 class seq_decl_plugin : public decl_plugin {
     struct psig {
         symbol          m_name;
diff --git a/src/smt/theory_char.h b/src/smt/theory_char.h
index 6a829487a..9e1664c27 100644
--- a/src/smt/theory_char.h
+++ b/src/smt/theory_char.h
@@ -34,34 +34,26 @@ namespace smt {
             void reset() { memset(this, 0, sizeof(*this)); }
         };
 
-        seq_util         seq;
+        seq_util                seq;
         vector<literal_vector>  m_bits;
         vector<expr_ref_vector> m_ebits;
-        unsigned_vector  m_var2value;
-        svector<theory_var> m_value2var;
-        bool             m_enabled { false };
-        bit_blaster      m_bb;
-        stats            m_stats;
-        symbol           m_bits2char;
-        seq_factory*     m_factory { nullptr };
+        unsigned_vector         m_var2value;
+        svector<theory_var>     m_value2var;
+        bool                    m_enabled { false };
+        bit_blaster             m_bb;
+        stats                   m_stats;
+        symbol                  m_bits2char;
+        seq_factory*            m_factory { nullptr };
 
         struct reset_bits;
 
-
         literal_vector const& get_bits(theory_var v);
-
         expr_ref_vector const& get_ebits(theory_var v);
-
         bool has_bits(theory_var v) const;
-
         void init_bits(theory_var v);
-
-        bool get_char_value(theory_var v, unsigned& c);
-        
+        bool get_char_value(theory_var v, unsigned& c);        
         void enforce_ackerman(theory_var v, theory_var w);
-
         void enforce_value_bound(theory_var v);
-
         void enforce_bits();
 
     public:
@@ -83,9 +75,6 @@ namespace smt {
         // ensure coherence for character codes and equalities of shared symbols.
         bool enabled() const { return m_enabled; }
         bool final_check();
-        // <= atomic constraints on characters
-        void assign_le(theory_var v1, theory_var v2, literal lit);
-        void assign_lt(theory_var v1, theory_var v2, literal lit);
         void new_const_char(theory_var v, unsigned c);
         unsigned get_char_value(theory_var v);
 
diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
index 2b09230b9..35727bce2 100644
--- a/src/util/CMakeLists.txt
+++ b/src/util/CMakeLists.txt
@@ -60,6 +60,7 @@ z3_add_component(util
     util.cpp
     warning.cpp
     z3_exception.cpp
+    zstring.cpp
   EXTRA_REGISTER_MODULE_HEADERS
     env_params.h
   MEMORY_INIT_FINALIZER_HEADERS
diff --git a/src/util/zstring.cpp b/src/util/zstring.cpp
new file mode 100644
index 000000000..73e195a3c
--- /dev/null
+++ b/src/util/zstring.cpp
@@ -0,0 +1,383 @@
+/*++
+Copyright (c) 2011 Microsoft Corporation
+
+Module Name:
+
+    zstring.cpp
+
+Abstract:
+
+    String wrapper for unicode/ascii internal strings as vectors
+
+Author:
+
+    Nikolaj Bjorner (nbjorner) 2021-01-26
+
+--*/
+#include "util/gparams.h"
+#include "util/zstring.h"
+
+static bool is_hex_digit(char ch, unsigned& d) {
+    if ('0' <= ch && ch <= '9') {
+        d = ch - '0';
+        return true;
+    }
+    if ('A' <= ch && ch <= 'F') {
+        d = 10 + ch - 'A';
+        return true;
+    }
+    if ('a' <= ch && ch <= 'f') {
+        d = 10 + ch - 'a';
+        return true;
+    }
+    return false;
+}
+
+static bool is_octal_digit(char ch, unsigned& d) {
+    if ('0' <= ch && ch <= '7') {
+        d = ch - '0';
+        return true;
+    }
+    return false;
+}
+
+bool zstring::is_escape_char(char const *& s, unsigned& result) {
+    unsigned d1, d2, d3;
+    if (*s != '\\' || *(s + 1) == 0) {
+        return false;
+    }
+    if (*(s + 1) == 'x' &&
+        is_hex_digit(*(s + 2), d1) && is_hex_digit(*(s + 3), d2)) {
+        result = d1*16 + d2;
+        s += 4;
+        return true;
+    }
+    /* C-standard octal escapes: either 1, 2, or 3 octal digits,
+     * stopping either at 3 digits or at the first non-digit character.
+     */
+    /* 1 octal digit */
+    if (is_octal_digit(*(s + 1), d1) && !is_octal_digit(*(s + 2), d2)) {
+        result = d1;
+        s += 2;
+        return true;
+    }
+    /* 2 octal digits */
+    if (is_octal_digit(*(s + 1), d1) && is_octal_digit(*(s + 2), d2) &&
+        !is_octal_digit(*(s + 3), d3)) {
+        result = d1 * 8 + d2;
+        s += 3;
+        return true;
+    }
+    /* 3 octal digits */
+    if (is_octal_digit(*(s + 1), d1) && is_octal_digit(*(s + 2), d2) &&
+        is_octal_digit(*(s + 3), d3)) {
+        result = d1*64 + d2*8 + d3;
+        s += 4;
+        return true;
+    }
+
+    if (*(s+1) == 'u' && *(s+2) == '{') {
+        result = 0;
+        for (unsigned i = 0; i < 5; ++i) {
+            if (is_hex_digit(*(s+3+i), d1)) {
+                result = 16*result + d1;
+            }
+            else if (*(s+3+i) == '}') {
+                if (result > 255 && !uses_unicode())
+                    throw default_exception("unicode characters outside of byte range are not supported");
+                s += 4 + i;
+                return true;                
+            }
+            else {
+                break;
+            }
+        }
+        return false;
+    }
+    if (*(s+1) == 'u' && is_hex_digit(*(s+2), d1)) {
+        result = d1;
+        unsigned i = 0;
+        for (; i < 4; ++i) {
+            if (is_hex_digit(*(s+3+i), d1)) {
+                result = 16*result + d1;
+            }
+            else {
+                break;
+            }
+        }
+        if (result > 255 && !uses_unicode())
+            throw default_exception("unicode characters outside of byte range are not supported");
+        s += 3 + i;
+        return true;
+    }
+    switch (*(s + 1)) {
+    case 'a':
+        result = '\a';
+        s += 2;
+        return true;
+    case 'b':
+        result = '\b';
+        s += 2;
+        return true;
+#if 0
+    case 'e':
+        result = '\e';
+        s += 2;
+        return true;
+#endif
+    case 'f':
+        result = '\f';
+        s += 2;
+        return true;
+    case 'n':
+        result = '\n';
+        s += 2;
+        return true;
+    case 'r':
+        result = '\r';
+        s += 2;
+        return true;
+    case 't':
+        result = '\t';
+        s += 2;
+        return true;
+    case 'v':
+        result = '\v';
+        s += 2;
+        return true;
+    default:
+        result = *(s + 1);
+        s += 2;
+        return true;
+    }
+    return false;
+}
+
+zstring::zstring(char const* s) {
+    while (*s) {
+        unsigned ch = 0;
+        if (is_escape_char(s, ch)) {
+            m_buffer.push_back(ch);
+        }
+        else {
+            m_buffer.push_back(*s);
+            ++s;
+        }
+    }
+    SASSERT(well_formed());
+}
+
+bool zstring::uses_unicode() const {
+    return gparams::get_value("unicode") == "true";
+}
+
+bool zstring::well_formed() const {
+    for (unsigned ch : m_buffer) {
+        if (ch > unicode_max_char()) {
+            IF_VERBOSE(0, verbose_stream() << "large character: " << ch << "\n";);
+            return false;
+        }
+    }
+    return true;
+}
+
+zstring::zstring(unsigned ch) {
+    m_buffer.push_back(ch);
+}
+
+zstring zstring::reverse() const {
+    zstring result;
+    for (unsigned i = length(); i-- > 0; ) {
+        result.m_buffer.push_back(m_buffer[i]);
+    }
+    return result;
+}
+
+zstring zstring::replace(zstring const& src, zstring const& dst) const {
+    zstring result;
+    if (length() < src.length()) {
+        return zstring(*this);
+    }
+    if (src.length() == 0) {
+        return dst + zstring(*this);
+    }
+    bool found = false;
+    for (unsigned i = 0; i < length(); ++i) {
+        bool eq = !found && i + src.length() <= length();
+        for (unsigned j = 0; eq && j < src.length(); ++j) {
+            eq = m_buffer[i+j] == src[j];
+        }
+        if (eq) {
+            result.m_buffer.append(dst.m_buffer);
+            found = true;
+            i += src.length() - 1;
+        }
+        else {
+            result.m_buffer.push_back(m_buffer[i]);
+        }
+    }
+    return result;
+}
+
+static const char esc_table[32][6] =
+    { "\\x00", "\\x01", "\\x02", "\\x03", "\\x04", "\\x05", "\\x06", "\\x07", "\\x08", "\\x09", "\\n",   "\\v",   "\\f",   "\\r",   "\\x0E", "\\x0F",
+      "\\x10", "\\x11", "\\x12", "\\x13", "\\x14", "\\x15", "\\x16", "\\x17", "\\x18", "\\x19", "\\x1A", "\\x1B", "\\x1C", "\\x1D", "\\x1E", "\\x1F"
+};
+
+std::string zstring::encode() const {
+    std::ostringstream strm;
+    char buffer[100];
+    unsigned offset = 0;
+#define _flush() if (offset > 0) { buffer[offset] = 0; strm << buffer; offset = 0; }
+    for (unsigned i = 0; i < m_buffer.size(); ++i) {
+        unsigned ch = m_buffer[i];
+        if (0 <= ch && ch < 32) {
+            _flush();
+            strm << esc_table[ch];
+        }
+        else if (ch == '\\') {
+            _flush();
+            strm << "\\\\";
+        }
+        else if (ch >= 256) {
+            _flush();
+            strm << "\\u{" << std::hex << ch << std::dec << "}";
+        }
+        else if (ch >= 128) {
+            _flush();
+            strm << "\\x" << std::hex << ch << std::dec; 
+        }
+        else {
+            if (offset == 99) { 
+                _flush();
+            }
+            buffer[offset++] = (char)ch;
+        }
+    }
+    _flush();
+    return strm.str();
+}
+
+bool zstring::suffixof(zstring const& other) const {
+    if (length() > other.length()) return false;
+    bool suffix = true;
+    for (unsigned i = 0; suffix && i < length(); ++i) {
+        suffix = m_buffer[length()-i-1] == other[other.length()-i-1];
+    }
+    return suffix;
+}
+
+bool zstring::prefixof(zstring const& other) const {
+    if (length() > other.length()) return false;
+    bool prefix = true;
+    for (unsigned i = 0; prefix && i < length(); ++i) {
+        prefix = m_buffer[i] == other[i];
+    }
+    return prefix;
+}
+
+bool zstring::contains(zstring const& other) const {
+    if (other.length() > length()) return false;
+    unsigned last = length() - other.length();
+    bool cont = false;
+    for (unsigned i = 0; !cont && i <= last; ++i) {
+        cont = true;
+        for (unsigned j = 0; cont && j < other.length(); ++j) {
+            cont = other[j] == m_buffer[j+i];
+        }
+    }
+    return cont;
+}
+
+int zstring::indexofu(zstring const& other, unsigned offset) const {
+    if (offset <= length() && other.length() == 0) return offset;
+    if (offset == length()) return -1;
+    if (offset > other.length() + offset) return -1;
+    if (other.length() + offset > length()) return -1;
+    unsigned last = length() - other.length();
+    for (unsigned i = offset; i <= last; ++i) {
+        bool prefix = true;
+        for (unsigned j = 0; prefix && j < other.length(); ++j) {
+            prefix = m_buffer[i + j] == other[j];
+        }
+        if (prefix) {
+            return static_cast<int>(i);
+        }
+    }
+    return -1;
+}
+
+int zstring::last_indexof(zstring const& other) const {
+    if (other.length() == 0) return length();
+    if (other.length() > length()) return -1;
+    for (unsigned last = length() - other.length(); last-- > 0; ) {
+        bool suffix = true;
+        for (unsigned j = 0; suffix && j < other.length(); ++j) {
+            suffix = m_buffer[last + j] == other[j];
+        }
+        if (suffix) {
+            return static_cast<int>(last);
+        }
+    }
+    return -1;
+}
+
+zstring zstring::extract(unsigned offset, unsigned len) const {
+    zstring result;
+    if (offset + len < offset) return result;
+    int last = std::min(offset+len, length());
+    for (int i = offset; i < last; ++i) {
+        result.m_buffer.push_back(m_buffer[i]);
+    }
+    return result;
+}
+
+zstring zstring::operator+(zstring const& other) const {
+    zstring result(*this);
+    result.m_buffer.append(other.m_buffer);
+    return result;
+}
+
+bool zstring::operator==(const zstring& other) const {
+    // two strings are equal iff they have the same length and characters
+    if (length() != other.length()) {
+        return false;
+    }
+    for (unsigned i = 0; i < length(); ++i) {
+        if (m_buffer[i] != other[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool zstring::operator!=(const zstring& other) const {
+    return !(*this == other);
+}
+
+std::ostream& operator<<(std::ostream &os, const zstring &str) {
+    return os << str.encode();
+}
+
+bool operator<(const zstring& lhs, const zstring& rhs) {
+    // This has the same semantics as strcmp()
+    unsigned len = lhs.length();
+    if (rhs.length() < len) {
+        len = rhs.length();
+    }
+    for (unsigned i = 0; i < len; ++i) {
+        unsigned Li = lhs[i];
+        unsigned Ri = rhs[i];
+        if (Li < Ri) {
+            return true;
+        } 
+        else if (Li > Ri) {
+            return false;
+        } 
+    }
+    // at this point, all compared characters are equal,
+    // so decide based on the relative lengths
+    return lhs.length() < rhs.length();
+}
+
+
diff --git a/src/util/zstring.h b/src/util/zstring.h
new file mode 100644
index 000000000..836792252
--- /dev/null
+++ b/src/util/zstring.h
@@ -0,0 +1,57 @@
+/*++
+Copyright (c) 2011 Microsoft Corporation
+
+Module Name:
+
+    zstring.h
+
+Abstract:
+
+    String wrapper for unicode/ascii internal strings as vectors
+
+Author:
+
+    Nikolaj Bjorner (nbjorner) 2021-01-26
+
+--*/
+#pragma once
+
+#include <string>
+#include "util/vector.h"
+#include "util/buffer.h"
+
+class zstring {
+private:
+    buffer<unsigned> m_buffer;
+    bool well_formed() const;
+    bool uses_unicode() const;
+    bool is_escape_char(char const *& s, unsigned& result);
+public:
+    static unsigned unicode_max_char() { return 196607; }
+    static unsigned unicode_num_bits() { return 18; }
+    static unsigned ascii_max_char() { return 255; }
+    static unsigned ascii_num_bits() { return 8; }
+    zstring() {}
+    zstring(char const* s);
+    zstring(const std::string &str) : zstring(str.c_str()) {}
+    zstring(unsigned sz, unsigned const* s) { m_buffer.append(sz, s); SASSERT(well_formed()); }
+    zstring(unsigned ch);
+    zstring replace(zstring const& src, zstring const& dst) const;
+    zstring reverse() const;
+    std::string encode() const;
+    unsigned length() const { return m_buffer.size(); }
+    unsigned operator[](unsigned i) const { return m_buffer[i]; }
+    bool empty() const { return m_buffer.empty(); }
+    bool suffixof(zstring const& other) const;
+    bool prefixof(zstring const& other) const;
+    bool contains(zstring const& other) const;
+    int  indexofu(zstring const& other, unsigned offset) const;
+    int  last_indexof(zstring const& other) const;
+    zstring extract(unsigned lo, unsigned hi) const;
+    zstring operator+(zstring const& other) const;
+    bool operator==(const zstring& other) const;
+    bool operator!=(const zstring& other) const;
+
+    friend std::ostream& operator<<(std::ostream &os, const zstring &str);
+    friend bool operator<(const zstring& lhs, const zstring& rhs);
+};