KiCad PCB EDA Suite
Loading...
Searching...
No Matches
utf8.h
Go to the documentation of this file.
1/*
2 * This program source code file is part of KiCad, a free EDA CAD application.
3 *
4 * Copyright (C) 2013 SoftPLC Corporation, Dick Hollenbeck <[email protected]>
5 * Copyright The KiCad Developers, see AUTHORS.txt for contributors.
6 *
7 * @author Dick Hollenbeck
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2
12 * of the License, or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, you may find one here:
21 * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
22 * or you may search the http://www.gnu.org website for the version 2 license,
23 * or you may write to the Free Software Foundation, Inc.,
24 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 */
26
27#ifndef UTF8_H_
28#define UTF8_H_
29
30#include <iostream>
31#include <string>
32#include <wx/string.h>
33
34#if defined(DEBUG)
35 #define UTF8_VERIFY // Might someday be a hidden cmake config option
36#endif
37
38
/**
 * Test a C string to see if it is UTF8 encoded.
 *
 * Only declared here; the definition lives in the implementation file.
 *
 * @param aString is the C string to examine.
 * @return presumably true when aString is valid UTF-8 -- confirm against the definition.
 */
44bool IsUTF8( const char* aString );
45
46
// MAYBE_VERIFY_UTF8( x ) asserts through wxASSERT that IsUTF8( x ) holds when UTF8_VERIFY
// is defined.  Without UTF8_VERIFY the macro expands to nothing, so call sites compile to
// no code at all.
47#if defined(UTF8_VERIFY)
48 #define MAYBE_VERIFY_UTF8(x) wxASSERT( IsUTF8(x) )
49#else
50 #define MAYBE_VERIFY_UTF8(x) // nothing
51#endif
52
53
70class UTF8
71{
72public:
73 UTF8( const wxString& o );
74
77 UTF8( const char* txt ) :
78 m_s( txt )
79 {
81 }
82
85 UTF8( const wchar_t* txt );
86
87 UTF8( const std::string& o ) :
88 m_s( o )
89 {
91 }
92
93 UTF8( const UTF8& o ) :
94 m_s( o.m_s )
95 {
97 }
98
100 {
101 }
102
103 ~UTF8() // Needed mainly to build python wrapper
104 {
105 }
106
107 // expose some std::string functions publicly, since base class must be private.
108 const char* c_str() const { return m_s.c_str(); }
109 bool empty() const { return m_s.empty(); }
110
111 std::string::size_type find( char c ) const { return m_s.find( c ); }
112 std::string::size_type find( char c, size_t s ) const { return m_s.find( c, s ); }
113
114 void clear() { m_s.clear(); }
115 std::string::size_type length() const { return m_s.length(); }
116 std::string::size_type size() const { return m_s.size(); }
117 int compare( const std::string& s ) const { return m_s.compare( s ); }
118
119 bool operator==( const UTF8& rhs ) const { return m_s == rhs.m_s; }
120 bool operator==( const std::string& rhs ) const { return m_s == rhs; }
121 bool operator==( const char* s ) const { return m_s == s; }
122
123 bool operator!=( const UTF8& rhs ) const { return !( operator==( rhs ) ); }
124 bool operator<( const UTF8& rhs ) const { return m_s < rhs.m_s; }
125 bool operator>( const UTF8& rhs ) const { return m_s > rhs.m_s; }
126
127 std::string::size_type find_first_of( const std::string& str,
128 std::string::size_type pos = 0 ) const
129 {
130 return m_s.find_first_of( str, pos );
131 }
132
133 UTF8& operator+=( const UTF8& str )
134 {
135 m_s += str.m_s;
137 return *this;
138 }
139
140 UTF8& operator+=( char ch )
141 {
142 m_s.operator+=( ch );
144 return *this;
145 }
146
147 UTF8& operator+=( const char* s )
148 {
149 m_s.operator+=( s );
151 return *this;
152 }
153
157 UTF8& operator+=( unsigned w_ch );
158
159 // std::string::npos is not constexpr, so we can't use it in an
160 // initializer.
161 static constexpr std::string::size_type npos = -1;
162
163 UTF8& operator=( const wxString& o );
164
165 UTF8& operator=( const std::string& o )
166 {
167 m_s = o;
169 return *this;
170 }
171
172 UTF8& operator=( const char* s )
173 {
174 m_s = s;
176 return *this;
177 }
178
179 UTF8& operator=( char c )
180 {
181 m_s = c;
183 return *this;
184 }
185
186 UTF8& operator=( const UTF8& aOther )
187 {
188 m_s = aOther.m_s;
190 return *this;
191 }
192
193 // Move assignment operator
194 UTF8& operator=( UTF8&& aOther ) noexcept
195 {
196 if (this != &aOther)
197 m_s = std::move( aOther.m_s );
198
200 return *this;
201 }
202
203 // a substring of a UTF8 is not necessarily a UTF8 if a multibyte character
204 // was split, so return std::string not UTF8
205 std::string substr( size_t pos = 0, size_t len = npos ) const
206 {
207 return m_s.substr( pos, len );
208 }
209
210 operator const std::string& () const { return m_s; }
211 //operator std::string& () { return m_s; }
212 //operator std::string () const { return m_s; }
213
214 wxString wx_str() const;
215 operator wxString () const;
216
217 // "Read only" iterating over bytes is done with these, use the uni_iter to iterate
218 // over UTF8 (multi-byte) characters
219 std::string::const_iterator begin() const { return m_s.begin(); }
220 std::string::const_iterator end() const { return m_s.end(); }
221
222#ifndef SWIG
231 {
232 public:
233 uni_iter() // Needed only to build python wrapper, not used outside the wrapper
234 {
235 it = nullptr;
236 }
237
238 uni_iter( const uni_iter& o )
239 {
240 it = o.it;
241 }
242
245 {
246 it += uni_forward( it );
247 return *this;
248 }
249
252 {
253 uni_iter ret = *this;
254
255 it += uni_forward( it );
256 return ret;
257 }
258
260 unsigned operator->() const
261 {
262 unsigned result;
263
264 // grab the result, do not advance
265 uni_forward( it, &result );
266 return result;
267 }
268
270 unsigned operator*() const
271 {
272 unsigned result;
273
274 // grab the result, do not advance
275 uni_forward( it, &result );
276 return result;
277 }
278
279 uni_iter operator-( int aVal ) const { return uni_iter( (char*) it - aVal ); }
280
281 bool operator==( const uni_iter& other ) const { return it == other.it; }
282 bool operator!=( const uni_iter& other ) const { return it != other.it; }
283
286 bool operator< ( const uni_iter& other ) const { return it < other.it; }
287 bool operator<=( const uni_iter& other ) const { return it <= other.it; }
288 bool operator> ( const uni_iter& other ) const { return it > other.it; }
289 bool operator>=( const uni_iter& other ) const { return it >= other.it; }
290
291 private:
292 friend class UTF8;
293
294 const unsigned char* it;
295
296 // private constructor
297 uni_iter( const char* start ) :
298 it( (const unsigned char*) start )
299 {
300 }
301 };
302
307 {
308 return uni_iter( m_s.data() );
309 }
310
315 {
316 return uni_iter( m_s.data() + m_s.size() );
317 }
318
327 static int uni_forward( const unsigned char* aSequence, unsigned* aResult = nullptr );
328#endif // SWIG
329
330protected:
331 std::string m_s;
332};
333
334
/// Stream insertion operator for UTF8.  Only declared here; the definition lives in the
/// implementation file.
335std::ostream& operator<<( std::ostream& aStream, const UTF8& aRhs );
336
337#endif // UTF8_H_
uni_iter is a non-mutating iterator that walks through unicode code points in the UTF8 encoded string...
Definition utf8.h:231
uni_iter(const uni_iter &o)
Definition utf8.h:238
unsigned operator*() const
return unicode at current position
Definition utf8.h:270
bool operator!=(const uni_iter &other) const
Definition utf8.h:282
uni_iter(const char *start)
Definition utf8.h:297
friend class UTF8
Definition utf8.h:292
const unsigned char * it
Definition utf8.h:294
bool operator<(const uni_iter &other) const
Since the ++ operators advance more than one byte, this is your best loop termination test,...
Definition utf8.h:286
uni_iter operator-(int aVal) const
Definition utf8.h:279
bool operator>=(const uni_iter &other) const
Definition utf8.h:289
bool operator<=(const uni_iter &other) const
Definition utf8.h:287
bool operator==(const uni_iter &other) const
Definition utf8.h:281
bool operator>(const uni_iter &other) const
Definition utf8.h:288
uni_iter operator++(int)
post-increment and return uni_iter at initial position
Definition utf8.h:251
const uni_iter & operator++()
pre-increment and return uni_iter at new position
Definition utf8.h:244
unsigned operator->() const
return unicode at current position
Definition utf8.h:260
An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion support to and fro...
Definition utf8.h:71
UTF8 & operator+=(const UTF8 &str)
Definition utf8.h:133
static constexpr std::string::size_type npos
Definition utf8.h:161
std::string::const_iterator begin() const
Definition utf8.h:219
UTF8 & operator=(UTF8 &&aOther) noexcept
Definition utf8.h:194
UTF8 & operator=(const std::string &o)
Definition utf8.h:165
std::string::size_type length() const
Definition utf8.h:115
std::string m_s
Definition utf8.h:331
std::string::size_type find(char c) const
Definition utf8.h:111
void clear()
Definition utf8.h:114
~UTF8()
Definition utf8.h:103
UTF8 & operator=(const UTF8 &aOther)
Definition utf8.h:186
bool empty() const
Definition utf8.h:109
bool operator!=(const UTF8 &rhs) const
Definition utf8.h:123
std::string substr(size_t pos=0, size_t len=npos) const
Definition utf8.h:205
std::string::size_type find(char c, size_t s) const
Definition utf8.h:112
uni_iter uend() const
Return a uni_iter initialized to the end of "this" UTF8 byte sequence.
Definition utf8.h:314
bool operator==(const std::string &rhs) const
Definition utf8.h:120
static int uni_forward(const unsigned char *aSequence, unsigned *aResult=nullptr)
Advance over a single UTF8 encoded multibyte character, capturing the Unicode character as it goes,...
Definition utf8.cpp:67
UTF8(const std::string &o)
Definition utf8.h:87
UTF8(const UTF8 &o)
Definition utf8.h:93
UTF8 & operator+=(const char *s)
Definition utf8.h:147
int compare(const std::string &s) const
Definition utf8.h:117
std::string::size_type find_first_of(const std::string &str, std::string::size_type pos=0) const
Definition utf8.h:127
UTF8 & operator+=(char ch)
Definition utf8.h:140
UTF8(const char *txt)
This is a constructor for which you could end up with non-UTF8 encoding, but that would be your fault...
Definition utf8.h:77
const char * c_str() const
Definition utf8.h:108
bool operator<(const UTF8 &rhs) const
Definition utf8.h:124
std::string::size_type size() const
Definition utf8.h:116
wxString wx_str() const
Definition utf8.cpp:45
UTF8 & operator=(char c)
Definition utf8.h:179
uni_iter ubegin() const
Returns a uni_iter initialized to the start of "this" UTF8 byte sequence.
Definition utf8.h:306
UTF8 & operator=(const char *s)
Definition utf8.h:172
bool operator==(const char *s) const
Definition utf8.h:121
UTF8(const wxString &o)
Definition utf8.cpp:39
std::string::const_iterator end() const
Definition utf8.h:220
UTF8 & operator=(const wxString &o)
Definition utf8.cpp:57
bool operator==(const UTF8 &rhs) const
Definition utf8.h:119
UTF8()
Definition utf8.h:99
bool operator>(const UTF8 &rhs) const
Definition utf8.h:125
wxString result
Test unit parsing edge cases and error handling.
#define MAYBE_VERIFY_UTF8(x)
Definition utf8.h:50
bool IsUTF8(const char *aString)
Test a C string to see if it is UTF8 encoded.
Definition utf8.cpp:179
std::ostream & operator<<(std::ostream &aStream, const UTF8 &aRhs)
Definition utf8.cpp:244