KiCad PCB EDA Suite
Loading...
Searching...
No Matches
utf8.h
Go to the documentation of this file.
1/*
2 * This program source code file is part of KiCad, a free EDA CAD application.
3 *
4 * Copyright (C) 2013 SoftPLC Corporation, Dick Hollenbeck <[email protected]>
5 * Copyright The KiCad Developers, see AUTHORS.txt for contributors.
6 *
7 * @author Dick Hollenbeck
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2
12 * of the License, or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program. If not, see <https://www.gnu.org/licenses/>.
21 */
22
23#ifndef UTF8_H_
24#define UTF8_H_
25
26#include <iostream>
27#include <string>
28#include <wx/string.h>
29
// In DEBUG builds, define UTF8_VERIFY so that MAYBE_VERIFY_UTF8() (defined
// further down in this header) expands to a wxASSERT instead of to nothing.
30#if defined(DEBUG)
31 #define UTF8_VERIFY // Might someday be a hidden cmake config option
32#endif
33
34
/**
 * Test a C string to see if it is UTF8 encoded.
 *
 * Only declared in this header; the definition is in utf8.cpp.
 *
 * @param aString is the C string to test.
 * @return a bool result of the test (NOTE(review): presumably true when the
 *         string is valid UTF8 -- confirm against the definition).
 */
40bool IsUTF8( const char* aString );
41
42
// MAYBE_VERIFY_UTF8(x): optional sanity check on a C string.
//  - With UTF8_VERIFY defined it expands to wxASSERT( IsUTF8(x) ).
//  - Otherwise it expands to nothing, so x is never evaluated.
43#if defined(UTF8_VERIFY)
44 #define MAYBE_VERIFY_UTF8(x) wxASSERT( IsUTF8(x) )
45#else
46 #define MAYBE_VERIFY_UTF8(x) // nothing
47#endif
48
49
/**
 * An 8 bit string that is intended to hold UTF8 encoded text.
 *
 * The text is stored in the std::string member m_s.  The class offers
 * conversions from/to wxString and std::string, forwards a selection of
 * std::string member functions, and provides uni_iter for read-only
 * iteration over Unicode code points instead of bytes.
 *
 * NOTE(review): this listing skips some original line numbers (for example
 * 76, 86, 92, 95, 225, 239, 246, 301 and 309).  Where the header line of a
 * body is missing, a comment below names the declaration as given by the
 * symbol index that follows the listing.  Statements missing from inside
 * bodies are not reconstructed here.
 */
66class UTF8
67{
68public:
 /// Construct from a wxString.  Declared here, defined in utf8.cpp.
69 UTF8( const wxString& o );
70
 /// Construct from a C string; the bytes of txt are copied into m_s as is.
 /// This is a constructor for which you could end up with non-UTF8 encoding,
 /// but that would be the caller's fault.
73 UTF8( const char* txt ) :
74 m_s( txt )
75 {
77 }
78
 /// Construct from a wide character C string.  Declared here only.
81 UTF8( const wchar_t* txt );
82
 /// Construct from a std::string; o is copied into m_s as is.
83 UTF8( const std::string& o ) :
84 m_s( o )
85 {
87 }
88
 /// Copy constructor: copies the other object's m_s.
89 UTF8( const UTF8& o ) :
90 m_s( o.m_s )
91 {
93 }
94
 // NOTE(review): the declaration line for the following empty body (original
 // line 95) is missing from this listing; the symbol index gives it as the
 // default constructor UTF8().
96 {
97 }
98
99 ~UTF8() // Needed mainly to build python wrapper
100 {
101 }
102
103 // expose some std::string functions publicly, since base class must be private.
 // Each of the following simply forwards to the same-named std::string
 // member of m_s.
104 const char* c_str() const { return m_s.c_str(); }
105 bool empty() const { return m_s.empty(); }
106
107 std::string::size_type find( char c ) const { return m_s.find( c ); }
108 std::string::size_type find( char c, size_t s ) const { return m_s.find( c, s ); }
109
110 void clear() { m_s.clear(); }
111 std::string::size_type length() const { return m_s.length(); }
112 std::string::size_type size() const { return m_s.size(); }
113 int compare( const std::string& s ) const { return m_s.compare( s ); }
114
 // Equality against another UTF8, a std::string or a C string; all compare
 // the stored std::string m_s.
115 bool operator==( const UTF8& rhs ) const { return m_s == rhs.m_s; }
116 bool operator==( const std::string& rhs ) const { return m_s == rhs; }
117 bool operator==( const char* s ) const { return m_s == s; }
118
 // Inequality is the negation of operator==( const UTF8& ); ordering uses
 // the std::string comparison of the two m_s members.
119 bool operator!=( const UTF8& rhs ) const { return !( operator==( rhs ) ); }
120 bool operator<( const UTF8& rhs ) const { return m_s < rhs.m_s; }
121 bool operator>( const UTF8& rhs ) const { return m_s > rhs.m_s; }
122
 /// Forward to std::string::find_first_of() on m_s, starting at pos.
123 std::string::size_type find_first_of( const std::string& str,
124 std::string::size_type pos = 0 ) const
125 {
126 return m_s.find_first_of( str, pos );
127 }
128
 /// Append the contents of another UTF8 and return *this.
129 UTF8& operator+=( const UTF8& str )
130 {
131 m_s += str.m_s;
133 return *this;
134 }
135
 /// Append a single char (one byte) and return *this.
136 UTF8& operator+=( char ch )
137 {
138 m_s.operator+=( ch );
140 return *this;
141 }
142
 /// Append a C string and return *this.
143 UTF8& operator+=( const char* s )
144 {
145 m_s.operator+=( s );
147 return *this;
148 }
149
 /// Append a character passed as an unsigned value.  Declared here only.
 /// NOTE(review): presumably w_ch is a Unicode code point that gets encoded
 /// as UTF8 before being appended -- confirm against the definition.
153 UTF8& operator+=( unsigned w_ch );
154
155 // std::string::npos is not constexpr, so we can't use it in an
156 // initializer.
 // -1 converted to the unsigned size_type yields its largest value.
157 static constexpr std::string::size_type npos = -1;
158
 /// Assign from a wxString.  Declared here, defined in utf8.cpp.
159 UTF8& operator=( const wxString& o );
160
 /// Assign from a std::string (copied as is) and return *this.
161 UTF8& operator=( const std::string& o )
162 {
163 m_s = o;
165 return *this;
166 }
167
 /// Assign from a C string (copied as is) and return *this.
168 UTF8& operator=( const char* s )
169 {
170 m_s = s;
172 return *this;
173 }
174
 /// Assign a single char, replacing the current contents, and return *this.
175 UTF8& operator=( char c )
176 {
177 m_s = c;
179 return *this;
180 }
181
 /// Copy assignment: copies the other object's m_s and returns *this.
182 UTF8& operator=( const UTF8& aOther )
183 {
184 m_s = aOther.m_s;
186 return *this;
187 }
188
189 // Move assignment operator
 // Self-assignment is skipped; otherwise the other object's m_s is moved
 // into this one.
190 UTF8& operator=( UTF8&& aOther ) noexcept
191 {
192 if (this != &aOther)
193 m_s = std::move( aOther.m_s );
194
196 return *this;
197 }
198
199 // a substring of a UTF8 is not necessarily a UTF8 if a multibyte character
200 // was split, so return std::string not UTF8
 // pos and len are forwarded to std::string::substr(), i.e. they count bytes.
201 std::string substr( size_t pos = 0, size_t len = npos ) const
202 {
203 return m_s.substr( pos, len );
204 }
205
 /// Implicit read-only access to the underlying std::string.
206 operator const std::string& () const { return m_s; }
207 //operator std::string& () { return m_s; }
208 //operator std::string () const { return m_s; }
209
 // Conversions to wxString.  Both are only declared here; the symbol index
 // places the definition of wx_str() in utf8.cpp.
210 wxString wx_str() const;
211 operator wxString () const;
212
213 // "Read only" iterating over bytes is done with these, use the uni_iter to iterate
214 // over UTF8 (multi-byte) characters
215 std::string::const_iterator begin() const { return m_s.begin(); }
216 std::string::const_iterator end() const { return m_s.end(); }
217
 // NOTE(review): the header line of the nested class that starts here
 // (original line 225) is missing from this listing; the symbol index names
 // it uni_iter and describes it as a non-mutating iterator that walks
 // through unicode code points in the UTF8 encoded string.
226 {
227 public:
228 uni_iter() // Needed only to build python wrapper, not used outside the wrapper
229 {
230 it = nullptr;
231 }
232
 /// Copy constructor: the copy points at the same byte as o.
233 uni_iter( const uni_iter& o )
234 {
235 it = o.it;
236 }
237
 // NOTE(review): declaration line (original line 239) missing from this
 // listing; per the symbol index this is `const uni_iter& operator++()`:
 // pre-increment and return uni_iter at new position.  The pointer advances
 // by the int returned from uni_forward().
240 {
241 it += uni_forward( it );
242 return *this;
243 }
244
 // NOTE(review): declaration line (original line 246) missing from this
 // listing; per the symbol index this is `uni_iter operator++(int)`:
 // post-increment and return uni_iter at initial position.
247 {
248 uni_iter ret = *this;
249
250 it += uni_forward( it );
251 return ret;
252 }
253
 /// Return unicode at current position (returned by value as unsigned, not
 /// as a pointer).
255 unsigned operator->() const
256 {
257 unsigned result;
258
259 // grab the result, do not advance
260 uni_forward( it, &result );
261 return result;
262 }
263
 /// Return unicode at current position.
265 unsigned operator*() const
266 {
267 unsigned result;
268
269 // grab the result, do not advance
270 uni_forward( it, &result );
271 return result;
272 }
273
 // Returns an iterator positioned aVal bytes (not code points) before this
 // one; the subtraction is plain char pointer arithmetic.
274 uni_iter operator-( int aVal ) const { return uni_iter( (char*) it - aVal ); }
275
 // Both comparisons look only at the byte pointer.
276 bool operator==( const uni_iter& other ) const { return it == other.it; }
277 bool operator!=( const uni_iter& other ) const { return it != other.it; }
278
 // Ordering comparisons on the byte pointer.  Since the ++ operators advance
 // more than one byte, operator< is described by the symbol index as the
 // best loop termination test.
281 bool operator< ( const uni_iter& other ) const { return it < other.it; }
282 bool operator<=( const uni_iter& other ) const { return it <= other.it; }
283 bool operator> ( const uni_iter& other ) const { return it > other.it; }
284 bool operator>=( const uni_iter& other ) const { return it >= other.it; }
285
286 private:
 // UTF8 needs access to the private constructor below.
287 friend class UTF8;
288
 // Current position, as a pointer to a byte of the iterated sequence.
289 const unsigned char* it;
290
291 // private constructor
292 uni_iter( const char* start ) :
293 it( (const unsigned char*) start )
294 {
295 }
296 };
297
 // NOTE(review): declaration line (original line 301) missing from this
 // listing; per the symbol index this is `uni_iter ubegin() const`, which
 // returns a uni_iter initialized to the start of "this" UTF8 byte sequence.
302 {
303 return uni_iter( m_s.data() );
304 }
305
 // NOTE(review): declaration line (original line 309) missing from this
 // listing; per the symbol index this is `uni_iter uend() const`, which
 // returns a uni_iter initialized to the end of "this" UTF8 byte sequence
 // (one past the last byte of m_s).
310 {
311 return uni_iter( m_s.data() + m_s.size() );
312 }
313
 /// Advance over a single UTF8 encoded multibyte character, capturing the
 /// Unicode character as it goes.  Declared here, defined in utf8.cpp.
 ///
 /// @param aSequence points at the first byte of the character to decode.
 /// @param aResult may be left as nullptr (the default); uni_iter passes the
 ///                address of an unsigned to obtain the decoded character.
 /// @return an int that uni_iter adds to its byte pointer in order to step
 ///         to the next character.
322 static int uni_forward( const unsigned char* aSequence, unsigned* aResult = nullptr );
323
324protected:
 // The stored text.
325 std::string m_s;
326};
327
328
/// Stream insertion operator taking a UTF8 on the right hand side.
/// Only declared in this header; the definition is in utf8.cpp.
329std::ostream& operator<<( std::ostream& aStream, const UTF8& aRhs );
330
331#endif // UTF8_H_
uni_iter is a non-mutating iterator that walks through unicode code points in the UTF8 encoded string...
Definition utf8.h:226
uni_iter(const uni_iter &o)
Definition utf8.h:233
unsigned operator*() const
return unicode at current position
Definition utf8.h:265
bool operator!=(const uni_iter &other) const
Definition utf8.h:277
uni_iter(const char *start)
Definition utf8.h:292
friend class UTF8
Definition utf8.h:287
const unsigned char * it
Definition utf8.h:289
bool operator<(const uni_iter &other) const
Since the ++ operators advance more than one byte, this is your best loop termination test: compare with < uend() rather than == uend().
Definition utf8.h:281
uni_iter operator-(int aVal) const
Definition utf8.h:274
bool operator>=(const uni_iter &other) const
Definition utf8.h:284
bool operator<=(const uni_iter &other) const
Definition utf8.h:282
bool operator==(const uni_iter &other) const
Definition utf8.h:276
bool operator>(const uni_iter &other) const
Definition utf8.h:283
uni_iter operator++(int)
post-increment and return uni_iter at initial position
Definition utf8.h:246
const uni_iter & operator++()
pre-increment and return uni_iter at new position
Definition utf8.h:239
unsigned operator->() const
return unicode at current position
Definition utf8.h:255
An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion support to and from wxString, to and from std::string, and has non-mutating iteration over Unicode characters.
Definition utf8.h:67
UTF8 & operator+=(const UTF8 &str)
Definition utf8.h:129
static constexpr std::string::size_type npos
Definition utf8.h:157
std::string::const_iterator begin() const
Definition utf8.h:215
UTF8 & operator=(UTF8 &&aOther) noexcept
Definition utf8.h:190
UTF8 & operator=(const std::string &o)
Definition utf8.h:161
std::string::size_type length() const
Definition utf8.h:111
std::string m_s
Definition utf8.h:325
std::string::size_type find(char c) const
Definition utf8.h:107
void clear()
Definition utf8.h:110
~UTF8()
Definition utf8.h:99
UTF8 & operator=(const UTF8 &aOther)
Definition utf8.h:182
bool empty() const
Definition utf8.h:105
bool operator!=(const UTF8 &rhs) const
Definition utf8.h:119
std::string substr(size_t pos=0, size_t len=npos) const
Definition utf8.h:201
std::string::size_type find(char c, size_t s) const
Definition utf8.h:108
uni_iter uend() const
Return a uni_iter initialized to the end of "this" UTF8 byte sequence.
Definition utf8.h:309
bool operator==(const std::string &rhs) const
Definition utf8.h:116
static int uni_forward(const unsigned char *aSequence, unsigned *aResult=nullptr)
Advance over a single UTF8 encoded multibyte character, capturing the Unicode character as it goes, and returning the number of bytes consumed.
Definition utf8.cpp:63
UTF8(const std::string &o)
Definition utf8.h:83
UTF8(const UTF8 &o)
Definition utf8.h:89
UTF8 & operator+=(const char *s)
Definition utf8.h:143
int compare(const std::string &s) const
Definition utf8.h:113
std::string::size_type find_first_of(const std::string &str, std::string::size_type pos=0) const
Definition utf8.h:123
UTF8 & operator+=(char ch)
Definition utf8.h:136
UTF8(const char *txt)
This is a constructor for which you could end up with non-UTF8 encoding, but that would be your fault...
Definition utf8.h:73
const char * c_str() const
Definition utf8.h:104
bool operator<(const UTF8 &rhs) const
Definition utf8.h:120
std::string::size_type size() const
Definition utf8.h:112
wxString wx_str() const
Definition utf8.cpp:41
UTF8 & operator=(char c)
Definition utf8.h:175
uni_iter ubegin() const
Returns a uni_iter initialized to the start of "this" UTF8 byte sequence.
Definition utf8.h:301
UTF8 & operator=(const char *s)
Definition utf8.h:168
bool operator==(const char *s) const
Definition utf8.h:117
UTF8(const wxString &o)
Definition utf8.cpp:35
std::string::const_iterator end() const
Definition utf8.h:216
UTF8 & operator=(const wxString &o)
Definition utf8.cpp:53
bool operator==(const UTF8 &rhs) const
Definition utf8.h:115
UTF8()
Definition utf8.h:95
bool operator>(const UTF8 &rhs) const
Definition utf8.h:121
wxString result
Test unit parsing edge cases and error handling.
#define MAYBE_VERIFY_UTF8(x)
Definition utf8.h:46
bool IsUTF8(const char *aString)
Test a C string to see if it is UTF8 encoded.
Definition utf8.cpp:175
std::ostream & operator<<(std::ostream &aStream, const UTF8 &aRhs)
Definition utf8.cpp:240