KiCad PCB EDA Suite
Loading...
Searching...
No Matches
utf8.h
Go to the documentation of this file.
1/*
2 * This program source code file is part of KiCad, a free EDA CAD application.
3 *
4 * Copyright (C) 2013 SoftPLC Corporation, Dick Hollenbeck <[email protected]>
5 * Copyright The KiCad Developers, see AUTHORS.txt for contributors.
6 *
7 * @author Dick Hollenbeck
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2
12 * of the License, or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, you may find one here:
21 * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
22 * or you may search the http://www.gnu.org website for the version 2 license,
23 * or you may write to the Free Software Foundation, Inc.,
24 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 */
26
27#ifndef UTF8_H_
28#define UTF8_H_
29
30#include <iostream>
31#include <string>
32#include <wx/string.h>
33
34#if defined(DEBUG)
35 #define UTF8_VERIFY // Might someday be a hidden cmake config option
36#endif
37
38
/// Test a C string to see if it is UTF8 encoded.
/// Declaration only; the definition is not part of this header.
44bool IsUTF8( const char* aString );
45
46
// MAYBE_VERIFY_UTF8(x) expands to wxASSERT( IsUTF8(x) ) when UTF8_VERIFY is defined
// and to nothing otherwise, so the check costs nothing when verification is off.
47#if defined(UTF8_VERIFY)
48 #define MAYBE_VERIFY_UTF8(x) wxASSERT( IsUTF8(x) )
49#else
50 #define MAYBE_VERIFY_UTF8(x) // nothing
51#endif
52
53
/**
 * An 8 bit string intended to hold UTF8 encoded text.
 *
 * Wraps a std::string (m_s), forwards a subset of the std::string interface, converts
 * to and from wxString, and offers read-only iteration over Unicode code points
 * through the nested uni_iter class.
 *
 * NOTE(review): this listing skips some numbered lines inside the constructor,
 * operator+= and operator= bodies; presumably they hold MAYBE_VERIFY_UTF8 checks --
 * confirm against the real header.
 */
71class UTF8
72{
73public:
 /// Construct from a wxString. Declaration only; the definition is not in this header.
74 UTF8( const wxString& o );
75
 /// Construct from a C string. The bytes are copied as given, so the caller is
 /// responsible for passing UTF8 encoded text.
78 UTF8( const char* txt ) :
79 m_s( txt )
80 {
82 }
83
 /// Construct from a wide character C string. Declaration only.
86 UTF8( const wchar_t* txt );
87
 /// Construct from a std::string; the bytes are copied as given.
88 UTF8( const std::string& o ) :
89 m_s( o )
90 {
92 }
93
 /// Copy constructor.
94 UTF8( const UTF8& o ) :
95 m_s( o.m_s )
96 {
98 }
99
 /// Default constructor: m_s starts out empty.
100 UTF8()
101 {
102 }
103
104 ~UTF8() // Needed mainly to build python wrapper
105 {
106 }
107
108 // expose some std::string functions publicly by forwarding to the wrapped m_s member.
109 const char* c_str() const { return m_s.c_str(); }
110 bool empty() const { return m_s.empty(); }
111
112 std::string::size_type find( char c ) const { return m_s.find( c ); }
113 std::string::size_type find( char c, size_t s ) const { return m_s.find( c, s ); }
114
115 void clear() { m_s.clear(); }
116 std::string::size_type length() const { return m_s.length(); }
117 std::string::size_type size() const { return m_s.size(); }
118 int compare( const std::string& s ) const { return m_s.compare( s ); }
119
 // Comparisons are plain std::string (byte-wise) comparisons of m_s.
120 bool operator==( const UTF8& rhs ) const { return m_s == rhs.m_s; }
121 bool operator==( const std::string& rhs ) const { return m_s == rhs; }
122 bool operator==( const char* s ) const { return m_s == s; }
123
124 bool operator!=( const UTF8& rhs ) const { return !( operator==( rhs ) ); }
125 bool operator<( const UTF8& rhs ) const { return m_s < rhs.m_s; }
126 bool operator>( const UTF8& rhs ) const { return m_s > rhs.m_s; }
127
 /// Forward to std::string::find_first_of; the result is a byte offset or npos.
128 std::string::size_type find_first_of( const std::string& str,
129 std::string::size_type pos = 0 ) const
130 {
131 return m_s.find_first_of( str, pos );
132 }
133
 /// Append the bytes of another UTF8.
134 UTF8& operator+=( const UTF8& str )
135 {
136 m_s += str.m_s;
138 return *this;
139 }
140
 /// Append a single byte.
141 UTF8& operator+=( char ch )
142 {
143 m_s.operator+=( ch );
145 return *this;
146 }
147
 /// Append a C string.
148 UTF8& operator+=( const char* s )
149 {
150 m_s.operator+=( s );
152 return *this;
153 }
154
 /// Append a character passed as an unsigned value. Declaration only.
 /// NOTE(review): presumably w_ch is a Unicode code point -- confirm in utf8.cpp.
158 UTF8& operator+=( unsigned w_ch );
159
160 // std::string::npos is not constexpr, so we can't use it in an
161 // initializer.
162 static constexpr std::string::size_type npos = -1;
163
 /// Assign from a wxString. Declaration only; the index places the definition in utf8.cpp.
164 UTF8& operator=( const wxString& o );
165
 /// Assign from a std::string.
166 UTF8& operator=( const std::string& o )
167 {
168 m_s = o;
170 return *this;
171 }
172
 /// Assign from a C string.
173 UTF8& operator=( const char* s )
174 {
175 m_s = s;
177 return *this;
178 }
179
 /// Assign a single byte.
180 UTF8& operator=( char c )
181 {
182 m_s = c;
184 return *this;
185 }
186
 /// Copy assignment.
187 UTF8& operator=( const UTF8& aOther )
188 {
189 m_s = aOther.m_s;
191 return *this;
192 }
193
194 // Move assignment operator
195 UTF8& operator=( UTF8&& aOther ) noexcept
196 {
197 if (this != &aOther)
198 m_s = std::move( aOther.m_s );
199
201 return *this;
202 }
203
204 // a substring of a UTF8 is not necessarily a UTF8 if a multibyte character
205 // was split, so return std::string not UTF8
206 std::string substr( size_t pos = 0, size_t len = npos ) const
207 {
208 return m_s.substr( pos, len );
209 }
210
 /// Read-only access to the wrapped std::string.
211 operator const std::string& () const { return m_s; }
212 //operator std::string& () { return m_s; }
213 //operator std::string () const { return m_s; }
214
 // Conversions to wxString. Declarations only; the index places wx_str() in utf8.cpp.
215 wxString wx_str() const;
216 operator wxString () const;
217
218 // "Read only" iterating over bytes is done with these, use the uni_iter to iterate
219 // over UTF8 (multi-byte) characters
220 std::string::const_iterator begin() const { return m_s.begin(); }
221 std::string::const_iterator end() const { return m_s.end(); }
222
223#ifndef SWIG
 /**
  * uni_iter is a non-mutating iterator that walks through Unicode code points in the
  * UTF8 encoded string. It holds a raw byte pointer and relies on
  * UTF8::uni_forward() to decode and to find out how far to advance.
  */
231 class uni_iter
232 {
233 public:
234 uni_iter() // Needed only to build python wrapper, not used outside the wrapper
235 {
236 it = nullptr;
237 }
238
239 uni_iter( const uni_iter& o )
240 {
241 it = o.it;
242 }
243
 /// pre-increment and return uni_iter at new position
245 const uni_iter& operator++()
246 {
247 it += uni_forward( it );
248 return *this;
249 }
250
 /// post-increment and return uni_iter at initial position
252 uni_iter operator++( int )
253 {
254 uni_iter ret = *this;
255
256 it += uni_forward( it );
257 return ret;
258 }
259
 /// return unicode at current position
261 unsigned operator->() const
262 {
263 unsigned result;
264
265 // grab the result, do not advance
266 uni_forward( it, &result );
267 return result;
268 }
269
 /// return unicode at current position
271 unsigned operator*() const
272 {
273 unsigned result;
274
275 // grab the result, do not advance
276 uni_forward( it, &result );
277 return result;
278 }
279
 /// Return an iterator moved back by aVal bytes (bytes, not code points).
 /// The cast keeps the pointee const; the private constructor takes const char*.
280 uni_iter operator-( int aVal ) const { return uni_iter( (const char*) it - aVal ); }
281
282 bool operator==( const uni_iter& other ) const { return it == other.it; }
283 bool operator!=( const uni_iter& other ) const { return it != other.it; }
284
 /// Since the ++ operators advance more than one byte, an ordering test against the
 /// end iterator is the best loop termination test.
287 bool operator< ( const uni_iter& other ) const { return it < other.it; }
288 bool operator<=( const uni_iter& other ) const { return it <= other.it; }
289 bool operator> ( const uni_iter& other ) const { return it > other.it; }
290 bool operator>=( const uni_iter& other ) const { return it >= other.it; }
291
292 private:
293 friend class UTF8;
294
 // Current position, as a pointer into the bytes of the iterated string.
295 const unsigned char* it;
296
297 // private constructor
298 uni_iter( const char* start ) :
299 it( (const unsigned char*) start )
300 {
301 }
302 };
303
 /// Returns a uni_iter initialized to the start of "this" UTF8 byte sequence.
307 uni_iter ubegin() const
308 {
309 return uni_iter( m_s.data() );
310 }
311
 /// Return a uni_iter initialized to the end of "this" UTF8 byte sequence.
315 uni_iter uend() const
316 {
317 return uni_iter( m_s.data() + m_s.size() );
318 }
319
 /// Advance over a single UTF8 encoded multibyte character, capturing the Unicode
 /// character in *aResult when aResult is not null. uni_iter adds the return value to
 /// its byte pointer, i.e. uses it as the number of bytes to advance.
 /// Declaration only; the index places the definition in utf8.cpp.
328 static int uni_forward( const unsigned char* aSequence, unsigned* aResult = nullptr );
329#endif // SWIG
330
331protected:
 // The wrapped byte string.
332 std::string m_s;
333};
334
335
/// Stream insertion for a UTF8. Declaration only; the definition is not in this header.
336std::ostream& operator<<( std::ostream& aStream, const UTF8& aRhs );
337
338#endif // UTF8_H_
uni_iter is a non-mutating iterator that walks through Unicode code points in the UTF8 encoded string.
Definition: utf8.h:232
uni_iter(const uni_iter &o)
Definition: utf8.h:239
unsigned operator*() const
return unicode at current position
Definition: utf8.h:271
bool operator!=(const uni_iter &other) const
Definition: utf8.h:283
uni_iter(const char *start)
Definition: utf8.h:298
const unsigned char * it
Definition: utf8.h:295
bool operator<(const uni_iter &other) const
Since the ++ operators advance more than one byte, this is your best loop termination test: compare with < against the end iterator rather than testing for equality.
Definition: utf8.h:287
uni_iter operator-(int aVal) const
Definition: utf8.h:280
bool operator>=(const uni_iter &other) const
Definition: utf8.h:290
bool operator<=(const uni_iter &other) const
Definition: utf8.h:288
bool operator==(const uni_iter &other) const
Definition: utf8.h:282
bool operator>(const uni_iter &other) const
Definition: utf8.h:289
uni_iter operator++(int)
post-increment and return uni_iter at initial position
Definition: utf8.h:252
const uni_iter & operator++()
pre-increment and return uni_iter at new position
Definition: utf8.h:245
unsigned operator->() const
return unicode at current position
Definition: utf8.h:261
An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion support to and from wxString and std::string.
Definition: utf8.h:72
UTF8 & operator+=(const UTF8 &str)
Definition: utf8.h:134
static constexpr std::string::size_type npos
Definition: utf8.h:162
std::string::const_iterator begin() const
Definition: utf8.h:220
UTF8 & operator=(UTF8 &&aOther) noexcept
Definition: utf8.h:195
UTF8 & operator=(const std::string &o)
Definition: utf8.h:166
std::string::size_type length() const
Definition: utf8.h:116
std::string m_s
Definition: utf8.h:332
std::string::size_type find(char c) const
Definition: utf8.h:112
void clear()
Definition: utf8.h:115
~UTF8()
Definition: utf8.h:104
UTF8 & operator=(const UTF8 &aOther)
Definition: utf8.h:187
bool empty() const
Definition: utf8.h:110
bool operator!=(const UTF8 &rhs) const
Definition: utf8.h:124
std::string substr(size_t pos=0, size_t len=npos) const
Definition: utf8.h:206
std::string::size_type find(char c, size_t s) const
Definition: utf8.h:113
uni_iter uend() const
Return a uni_iter initialized to the end of "this" UTF8 byte sequence.
Definition: utf8.h:315
bool operator==(const std::string &rhs) const
Definition: utf8.h:121
static int uni_forward(const unsigned char *aSequence, unsigned *aResult=nullptr)
Advance over a single UTF8 encoded multibyte character, capturing the Unicode character as it goes, and returning the number of bytes consumed.
Definition: utf8.cpp:67
UTF8(const std::string &o)
Definition: utf8.h:88
UTF8(const UTF8 &o)
Definition: utf8.h:94
UTF8 & operator+=(const char *s)
Definition: utf8.h:148
int compare(const std::string &s) const
Definition: utf8.h:118
std::string::size_type find_first_of(const std::string &str, std::string::size_type pos=0) const
Definition: utf8.h:128
UTF8 & operator+=(char ch)
Definition: utf8.h:141
UTF8(const char *txt)
This is a constructor for which you could end up with non-UTF8 encoding, but that would be your fault...
Definition: utf8.h:78
const char * c_str() const
Definition: utf8.h:109
bool operator<(const UTF8 &rhs) const
Definition: utf8.h:125
std::string::size_type size() const
Definition: utf8.h:117
wxString wx_str() const
Definition: utf8.cpp:45
UTF8 & operator=(char c)
Definition: utf8.h:180
uni_iter ubegin() const
Returns a uni_iter initialized to the start of "this" UTF8 byte sequence.
Definition: utf8.h:307
UTF8 & operator=(const char *s)
Definition: utf8.h:173
bool operator==(const char *s) const
Definition: utf8.h:122
std::string::const_iterator end() const
Definition: utf8.h:221
UTF8 & operator=(const wxString &o)
Definition: utf8.cpp:57
bool operator==(const UTF8 &rhs) const
Definition: utf8.h:120
UTF8()
Definition: utf8.h:100
bool operator>(const UTF8 &rhs) const
Definition: utf8.h:126
std::ostream & operator<<(std::ostream &aStream, const EDA_TEXT &aText)
Definition: eda_text.cpp:1295
#define MAYBE_VERIFY_UTF8(x)
Definition: utf8.h:50
bool IsUTF8(const char *aString)
Test a C string to see if it is UTF8 encoded.
Definition: utf8.cpp:179