doxygen/utf8_8h_source.html

/*

 * This program source code file is part of KiCad, a free EDA CAD application.

 *

 * Copyright (C) 2013 SoftPLC Corporation, Dick Hollenbeck <[email protected]>

 * Copyright The KiCad Developers, see AUTHORS.txt for contributors.

 *

 * @author Dick Hollenbeck

 *

 * This program is free software; you can redistribute it and/or

 * modify it under the terms of the GNU General Public License

 * as published by the Free Software Foundation; either version 2

 * of the License, or (at your option) any later version.

 *

 * This program is distributed in the hope that it will be useful,

 * but WITHOUT ANY WARRANTY; without even the implied warranty of

 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

 * GNU General Public License for more details.

 *

 * You should have received a copy of the GNU General Public License

 * along with this program; if not, you may find one here:

 * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html

 * or you may search the http://www.gnu.org website for the version 2 license,

 * or you may write to the Free Software Foundation, Inc.,

 * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA

 */


#ifndef UTF8_H_

#define UTF8_H_


#include <iostream>

#include <string>

#include <wx/string.h>


// UTF8 validity checking is switched on automatically whenever DEBUG is defined.
#if defined(DEBUG)

 #define UTF8_VERIFY    // Might someday be a hidden cmake config option

#endif


/**
 * Test a C string to see if it is UTF8 encoded.
 *
 * Only declared in this header; the definition lives in utf8.cpp.
 *
 * @param aString is the C string to examine.
 * @return true if @a aString is UTF8 encoded.
 */
bool IsUTF8( const char* aString );


// MAYBE_VERIFY_UTF8( x ) expands to wxASSERT( IsUTF8( x ) ) when UTF8_VERIFY is
// defined and to nothing otherwise, so it must only be used as a statement.
#if defined(UTF8_VERIFY)

 #define MAYBE_VERIFY_UTF8(x)       wxASSERT( IsUTF8(x) )

#else

 #define MAYBE_VERIFY_UTF8(x)       // nothing

#endif


/**
 * An 8 bit string that is assuredly encoded in UTF8, and supplies conversion support
 * to and from wxString and std::string.
 *
 * The bytes live in a wrapped std::string (m_s) and a subset of the std::string
 * interface is forwarded to it.  Every operation that stores new content is followed
 * by MAYBE_VERIFY_UTF8(), which asserts that the content is valid UTF8 in builds where
 * that macro is active and is a no-op otherwise.
 *
 * Byte-wise read-only iteration is available through begin() / end(); iteration over
 * Unicode code points is available through ubegin() / uend() and the uni_iter class.
 */
class UTF8
{
public:
    /// Construct from a wxString.  Defined out of line.
    UTF8( const wxString& o );

    /**
     * Construct from a C string.
     *
     * This is a constructor for which you could end up with non-UTF8 encoding, but that
     * would be the caller's fault.
     *
     * @param txt is handed straight to the std::string constructor and therefore must
     *            not be null.
     */
    UTF8( const char* txt ) :
            m_s( txt )
    {
        MAYBE_VERIFY_UTF8( c_str() );
    }

    /// Construct from a wide C string.  Defined out of line.
    UTF8( const wchar_t* txt );

    /// Construct from a std::string whose bytes are expected to already be UTF8.
    UTF8( const std::string& o ) :
            m_s( o )
    {
        MAYBE_VERIFY_UTF8( c_str() );
    }

    /// Copy constructor.
    UTF8( const UTF8& o ) :
            m_s( o.m_s )
    {
        MAYBE_VERIFY_UTF8( c_str() );
    }

#ifndef SWIG
    /**
     * Move constructor: take over the byte buffer of @a o instead of copying it.
     *
     * The copy constructor above is user-declared, which suppresses the implicit move
     * constructor; without this overload every move construction (returning by value,
     * container growth, std::move) would silently fall back to a full copy.
     *
     * Hidden from SWIG: rvalue references have no meaning in the scripting wrapper.
     */
    UTF8( UTF8&& o ) noexcept :
            m_s( std::move( o.m_s ) )
    {
        MAYBE_VERIFY_UTF8( c_str() );
    }
#endif

    /// Construct an empty string.
    UTF8()
    {
    }

    ~UTF8()     // Needed mainly to build python wrapper
    {
    }

    // Forward a subset of the std::string interface to the wrapped m_s member.  All
    // positions and lengths below are counted in bytes, not in Unicode code points.

    const char* c_str()                         const   { return m_s.c_str(); }
    bool empty()                                const   { return m_s.empty(); }

    std::string::size_type find( char c )       const   { return m_s.find( c ); }
    std::string::size_type find( char c, size_t s )     const   { return m_s.find( c, s ); }

    void clear()                                        { m_s.clear(); }
    std::string::size_type length()             const   { return m_s.length(); }
    std::string::size_type size()               const   { return m_s.size(); }
    int compare( const std::string& s )         const   { return m_s.compare( s ); }

    // Comparisons are plain byte-wise std::string comparisons.
    bool operator==( const UTF8& rhs )          const   { return m_s == rhs.m_s; }
    bool operator==( const std::string& rhs )   const   { return m_s == rhs; }
    bool operator==( const char* s )            const   { return m_s == s; }

    bool operator!=( const UTF8& rhs )          const   { return !( *this == rhs ); }
    bool operator<( const UTF8& rhs )           const   { return m_s < rhs.m_s; }
    bool operator>( const UTF8& rhs )           const   { return m_s > rhs.m_s; }

    /// Forwarded std::string::find_first_of(); @a pos and the result are byte offsets.
    std::string::size_type find_first_of( const std::string& str,
                                          std::string::size_type pos = 0 ) const
    {
        return m_s.find_first_of( str, pos );
    }

    /// Append another UTF8 string.
    UTF8& operator+=( const UTF8& str )
    {
        m_s += str.m_s;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Append a single byte.
    UTF8& operator+=( char ch )
    {
        m_s += ch;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Append a C string; @a s must not be null.
    UTF8& operator+=( const char* s )
    {
        m_s += s;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Append the character @a w_ch.  Defined out of line.
    UTF8& operator+=( unsigned w_ch );

    // Same value as std::string::npos (the largest size_type).  Kept as a separate
    // constexpr member following the original author's note that std::string::npos
    // could not be used in an initializer here.
    static constexpr std::string::size_type npos = -1;

    /// Assign from a wxString.  Defined out of line.
    UTF8& operator=( const wxString& o );

    /// Assign from a std::string whose bytes are expected to already be UTF8.
    UTF8& operator=( const std::string& o )
    {
        m_s = o;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Assign from a C string; @a s must not be null.
    UTF8& operator=( const char* s )
    {
        m_s = s;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Replace the content with the single byte @a c.
    UTF8& operator=( char c )
    {
        m_s = c;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Copy assignment.
    UTF8& operator=( const UTF8& aOther )
    {
        m_s = aOther.m_s;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Move assignment; self-assignment leaves the object untouched.
    UTF8& operator=( UTF8&& aOther ) noexcept
    {
        if( this != &aOther )
            m_s = std::move( aOther.m_s );

        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /**
     * Return a byte range of this string.
     *
     * A substring of a UTF8 is not necessarily a UTF8 if a multibyte character was
     * split, so the result is a std::string and not a UTF8.
     *
     * @param pos is the byte offset of the first byte to copy.
     * @param len is the maximum number of bytes to copy; npos means "to the end".
     */
    std::string substr( size_t pos = 0, size_t len = npos ) const
    {
        return m_s.substr( pos, len );
    }

    /// Read-only access to the wrapped std::string.
    operator const std::string& () const    { return m_s; }

    // Deliberately left disabled:
    //operator std::string& ()                { return m_s; }
    //operator std::string () const           { return m_s; }

    /// Return the content as a wxString.  Defined out of line.
    wxString wx_str() const;

    /// Conversion to wxString.  Defined out of line.
    operator wxString () const;

    // "Read only" iterating over bytes is done with these, use the uni_iter to iterate
    // over UTF8 (multi-byte) characters
    std::string::const_iterator begin()         const   { return m_s.begin(); }
    std::string::const_iterator end()           const   { return m_s.end(); }

#ifndef SWIG

    /**
     * uni_iter is a non-mutating iterator that walks through Unicode code points in
     * the UTF8 encoded string.
     *
     * It holds a raw pointer into the bytes of the UTF8 it was obtained from and
     * relies on UTF8::uni_forward() to decode a code point and to learn how many
     * bytes to step over.  Instances pointing into a string are only created by
     * UTF8::ubegin() and UTF8::uend().
     */
    class uni_iter
    {
    public:
        /// Needed only to build python wrapper, not used outside the wrapper.
        uni_iter() :
                it( nullptr )
        {
        }

        uni_iter( const uni_iter& o ) :
                it( o.it )
        {
        }

        // The copy constructor is user-declared, so spell out copy assignment instead
        // of relying on the deprecated implicit one.
        uni_iter& operator=( const uni_iter& o ) = default;

        /// pre-increment and return uni_iter at new position
        const uni_iter& operator++()
        {
            it += uni_forward( it );
            return *this;
        }

        /// post-increment and return uni_iter at initial position
        uni_iter operator++( int )
        {
            uni_iter ret = *this;

            it += uni_forward( it );
            return ret;
        }

        /// return unicode at current position
        unsigned operator->() const
        {
            return operator*();
        }

        /// return unicode at current position
        unsigned operator*() const
        {
            // Zero-initialised so that a well-defined value is returned even if
            // uni_forward() leaves the output untouched.
            unsigned result = 0;

            // grab the result, do not advance
            uni_forward( it, &result );
            return result;
        }

        /// Return an iterator @a aVal BYTES (not code points) before this one.
        uni_iter operator-( int aVal ) const
        {
            return uni_iter( reinterpret_cast<const char*>( it ) - aVal );
        }

        bool operator==( const uni_iter& other ) const  { return it == other.it; }
        bool operator!=( const uni_iter& other ) const  { return it != other.it; }

        /// Since the ++ operators advance more than one byte, this is your best loop
        /// termination test: compare against UTF8::uend() with < rather than !=.
        bool operator< ( const uni_iter& other ) const  { return it <  other.it; }
        bool operator<=( const uni_iter& other ) const  { return it <= other.it; }
        bool operator> ( const uni_iter& other ) const  { return it >  other.it; }
        bool operator>=( const uni_iter& other ) const  { return it >= other.it; }

    private:
        friend class UTF8;

        /// Current position inside the byte sequence being walked.
        const unsigned char* it;

        /// Private constructor: only UTF8 may create an iterator at a given byte.
        uni_iter( const char* start ) :
                it( reinterpret_cast<const unsigned char*>( start ) )
        {
        }
    };

    /// Returns a uni_iter initialized to the start of "this" UTF8 byte sequence.
    uni_iter ubegin() const
    {
        return uni_iter( m_s.data() );
    }

    /// Return a uni_iter initialized to the end of "this" UTF8 byte sequence.
    uni_iter uend() const
    {
        return uni_iter( m_s.data() + m_s.size() );
    }

    /**
     * Advance over a single UTF8 encoded multibyte character, capturing the Unicode
     * character as it goes.  Defined out of line.
     *
     * @param aSequence points at the first byte of the encoded character.
     * @param aResult if not null, receives the decoded Unicode character.
     * @return the number of bytes that uni_iter adds to its position to step over
     *         the character.
     */
    static int uni_forward( const unsigned char* aSequence, unsigned* aResult = nullptr );

#endif  // SWIG

protected:
    std::string m_s;    ///< The UTF8 encoded bytes.
};


/// Stream insertion operator for UTF8; only declared here, defined in utf8.cpp.
std::ostream& operator<<( std::ostream& aStream, const UTF8& aRhs );


#endif // UTF8_H_

UTF8::uni_iter
uni_iter is a non-mutating iterator that walks through unicode code points in the UTF8 encoded string...
Definition utf8.h:231

UTF8::uni_iter::uni_iter
uni_iter(const uni_iter &o)
Definition utf8.h:238

UTF8::uni_iter::operator*
unsigned operator*() const
return unicode at current position
Definition utf8.h:270

UTF8::uni_iter::operator!=
bool operator!=(const uni_iter &other) const
Definition utf8.h:282

UTF8::uni_iter::uni_iter
uni_iter(const char *start)
Definition utf8.h:297

UTF8::uni_iter::UTF8
friend class UTF8
Definition utf8.h:292

UTF8::uni_iter::it
const unsigned char * it
Definition utf8.h:294

UTF8::uni_iter::operator<
bool operator<(const uni_iter &other) const
Since the ++ operators advance more than one byte, this is your best loop termination test,...
Definition utf8.h:286

UTF8::uni_iter::operator-
uni_iter operator-(int aVal) const
Definition utf8.h:279

UTF8::uni_iter::operator>=
bool operator>=(const uni_iter &other) const
Definition utf8.h:289

UTF8::uni_iter::uni_iter
uni_iter()
Definition utf8.h:233

UTF8::uni_iter::operator<=
bool operator<=(const uni_iter &other) const
Definition utf8.h:287

UTF8::uni_iter::operator==
bool operator==(const uni_iter &other) const
Definition utf8.h:281

UTF8::uni_iter::operator>
bool operator>(const uni_iter &other) const
Definition utf8.h:288

UTF8::uni_iter::operator++
uni_iter operator++(int)
post-increment and return uni_iter at initial position
Definition utf8.h:251

UTF8::uni_iter::operator++
const uni_iter & operator++()
pre-increment and return uni_iter at new position
Definition utf8.h:244

UTF8::uni_iter::operator->
unsigned operator->() const
return unicode at current position
Definition utf8.h:260

UTF8
An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion support to and fro...
Definition utf8.h:71

UTF8::operator+=
UTF8 & operator+=(const UTF8 &str)
Definition utf8.h:133

UTF8::npos
static constexpr std::string::size_type npos
Definition utf8.h:161

UTF8::begin
std::string::const_iterator begin() const
Definition utf8.h:219

UTF8::operator=
UTF8 & operator=(UTF8 &&aOther) noexcept
Definition utf8.h:194

UTF8::operator=
UTF8 & operator=(const std::string &o)
Definition utf8.h:165

UTF8::length
std::string::size_type length() const
Definition utf8.h:115

UTF8::m_s
std::string m_s
Definition utf8.h:331

UTF8::find
std::string::size_type find(char c) const
Definition utf8.h:111

UTF8::clear
void clear()
Definition utf8.h:114

UTF8::~UTF8
~UTF8()
Definition utf8.h:103

UTF8::operator=
UTF8 & operator=(const UTF8 &aOther)
Definition utf8.h:186

UTF8::empty
bool empty() const
Definition utf8.h:109

UTF8::operator!=
bool operator!=(const UTF8 &rhs) const
Definition utf8.h:123

UTF8::substr
std::string substr(size_t pos=0, size_t len=npos) const
Definition utf8.h:205

UTF8::find
std::string::size_type find(char c, size_t s) const
Definition utf8.h:112

UTF8::uend
uni_iter uend() const
Return a uni_iter initialized to the end of "this" UTF8 byte sequence.
Definition utf8.h:314

UTF8::operator==
bool operator==(const std::string &rhs) const
Definition utf8.h:120

UTF8::uni_forward
static int uni_forward(const unsigned char *aSequence, unsigned *aResult=nullptr)
Advance over a single UTF8 encoded multibyte character, capturing the Unicode character as it goes,...
Definition utf8.cpp:67

UTF8::UTF8
UTF8(const std::string &o)
Definition utf8.h:87

UTF8::UTF8
UTF8(const UTF8 &o)
Definition utf8.h:93

UTF8::operator+=
UTF8 & operator+=(const char *s)
Definition utf8.h:147

UTF8::compare
int compare(const std::string &s) const
Definition utf8.h:117

UTF8::find_first_of
std::string::size_type find_first_of(const std::string &str, std::string::size_type pos=0) const
Definition utf8.h:127

UTF8::operator+=
UTF8 & operator+=(char ch)
Definition utf8.h:140

UTF8::UTF8
UTF8(const char *txt)
This is a constructor for which you could end up with non-UTF8 encoding, but that would be your fault...
Definition utf8.h:77

UTF8::c_str
const char * c_str() const
Definition utf8.h:108

UTF8::operator<
bool operator<(const UTF8 &rhs) const
Definition utf8.h:124

UTF8::size
std::string::size_type size() const
Definition utf8.h:116

UTF8::wx_str
wxString wx_str() const
Definition utf8.cpp:45

UTF8::operator=
UTF8 & operator=(char c)
Definition utf8.h:179

UTF8::ubegin
uni_iter ubegin() const
Returns a uni_iter initialized to the start of "this" UTF8 byte sequence.
Definition utf8.h:306

UTF8::operator=
UTF8 & operator=(const char *s)
Definition utf8.h:172

UTF8::operator==
bool operator==(const char *s) const
Definition utf8.h:121

UTF8::UTF8
UTF8(const wxString &o)
Definition utf8.cpp:39

UTF8::end
std::string::const_iterator end() const
Definition utf8.h:220

UTF8::operator=
UTF8 & operator=(const wxString &o)
Definition utf8.cpp:57

UTF8::operator==
bool operator==(const UTF8 &rhs) const
Definition utf8.h:119

UTF8::UTF8
UTF8()
Definition utf8.h:99

UTF8::operator>
bool operator>(const UTF8 &rhs) const
Definition utf8.h:125

result
wxString result
Test unit parsing edge cases and error handling.
Definition test_text_eval_numeric_compat.cpp:602

MAYBE_VERIFY_UTF8
#define MAYBE_VERIFY_UTF8(x)
Definition utf8.h:50

IsUTF8
bool IsUTF8(const char *aString)
Test a C string to see if it is UTF8 encoded.
Definition utf8.cpp:179

operator<<
std::ostream & operator<<(std::ostream &aStream, const UTF8 &aRhs)
Definition utf8.cpp:244