KiCad PCB EDA Suite
Loading...
Searching...
No Matches
utf8.h
Go to the documentation of this file.
/*
 * This program source code file is part of KiCad, a free EDA CAD application.
 *
 * Copyright (C) 2013 SoftPLC Corporation, Dick Hollenbeck <[email protected]>
 * Copyright (C) 2013-2021 KiCad Developers, see AUTHORS.txt for contributors.
 *
 * @author Dick Hollenbeck
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, you may find one here:
 * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
 * or you may search the http://www.gnu.org website for the version 2 license,
 * or you may write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
 */
26
27#ifndef UTF8_H_
28#define UTF8_H_
29
30#include <iostream>
31#include <string>
32#include <wx/string.h>
33
// UTF8 verification is switched on only when DEBUG is defined.
#if defined(DEBUG)
 #define UTF8_VERIFY // Might someday be a hidden cmake config option
#endif


/**
 * Test a C string to see if it is UTF8 encoded.
 *
 * Only declared here; the definition lives in utf8.cpp.
 *
 * @param aString is the C string to test.
 * @return the result of the test as a bool.
 */
bool IsUTF8( const char* aString );


// MAYBE_VERIFY_UTF8( x ) asserts (through wxASSERT) that x passes IsUTF8() when
// UTF8_VERIFY is defined, and expands to nothing otherwise.
#if defined(UTF8_VERIFY)
 #define MAYBE_VERIFY_UTF8(x) wxASSERT( IsUTF8(x) )
#else
 #define MAYBE_VERIFY_UTF8(x) // nothing
#endif
52
53
71class UTF8
72{
73public:
74 UTF8( const wxString& o );
75
78 UTF8( const char* txt ) :
79 m_s( txt )
80 {
82 }
83
86 UTF8( const wchar_t* txt );
87
88 UTF8( const std::string& o ) :
89 m_s( o )
90 {
92 }
93
95 {
96 }
97
98 ~UTF8() // Needed mainly to build python wrapper
99 {
100 }
101
102 // expose some std::string functions publicly, since base class must be private.
103 const char* c_str() const { return m_s.c_str(); }
104 bool empty() const { return m_s.empty(); }
105
106 std::string::size_type find( char c ) const { return m_s.find( c ); }
107 std::string::size_type find( char c, size_t s ) const { return m_s.find( c, s ); }
108
109 void clear() { m_s.clear(); }
110 std::string::size_type length() const { return m_s.length(); }
111 std::string::size_type size() const { return m_s.size(); }
112 int compare( const std::string& s ) const { return m_s.compare( s ); }
113
114 bool operator==( const UTF8& rhs ) const { return m_s == rhs.m_s; }
115 bool operator==( const std::string& rhs ) const { return m_s == rhs; }
116 bool operator==( const char* s ) const { return m_s == s; }
117
118 bool operator!=( const UTF8& rhs ) const { return !( operator==( rhs ) ); }
119 bool operator<( const UTF8& rhs ) const { return m_s < rhs.m_s; }
120 bool operator>( const UTF8& rhs ) const { return m_s > rhs.m_s; }
121
122 std::string::size_type find_first_of( const std::string& str,
123 std::string::size_type pos = 0 ) const
124 {
125 return m_s.find_first_of( str, pos );
126 }
127
128 UTF8& operator+=( const UTF8& str )
129 {
130 m_s += str.m_s;
132 return *this;
133 }
134
135 UTF8& operator+=( char ch )
136 {
137 m_s.operator+=( ch );
139 return *this;
140 }
141
142 UTF8& operator+=( const char* s )
143 {
144 m_s.operator+=( s );
146 return *this;
147 }
148
152 UTF8& operator+=( unsigned w_ch );
153
154 // std::string::npos is not constexpr, so we can't use it in an
155 // initializer.
156 static constexpr std::string::size_type npos = -1;
157
158 UTF8& operator=( const wxString& o );
159
160 UTF8& operator=( const std::string& o )
161 {
162 m_s = o;
164 return *this;
165 }
166
167 UTF8& operator=( const char* s )
168 {
169 m_s = s;
171 return *this;
172 }
173
174 UTF8& operator=( char c )
175 {
176 m_s = c;
178 return *this;
179 }
180
181 // a substring of a UTF8 is not necessarily a UTF8 if a multibyte character
182 // was split, so return std::string not UTF8
183 std::string substr( size_t pos = 0, size_t len = npos ) const
184 {
185 return m_s.substr( pos, len );
186 }
187
188 operator const std::string& () const { return m_s; }
189 //operator std::string& () { return m_s; }
190 //operator std::string () const { return m_s; }
191
192 wxString wx_str() const;
193 operator wxString () const;
194
195 // "Read only" iterating over bytes is done with these, use the uni_iter to iterate
196 // over UTF8 (multi-byte) characters
197 std::string::const_iterator begin() const { return m_s.begin(); }
198 std::string::const_iterator end() const { return m_s.end(); }
199
200#ifndef SWIG
209 {
210 public:
211 uni_iter() // Needed only to build python wrapper, not used outside the wrapper
212 {
213 it = nullptr;
214 }
215
216 uni_iter( const uni_iter& o )
217 {
218 it = o.it;
219 }
220
223 {
224 it += uni_forward( it );
225 return *this;
226 }
227
230 {
231 uni_iter ret = *this;
232
233 it += uni_forward( it );
234 return ret;
235 }
236
238 unsigned operator->() const
239 {
240 unsigned result;
241
242 // grab the result, do not advance
243 uni_forward( it, &result );
244 return result;
245 }
246
248 unsigned operator*() const
249 {
250 unsigned result;
251
252 // grab the result, do not advance
253 uni_forward( it, &result );
254 return result;
255 }
256
257 uni_iter operator-( int aVal ) const { return uni_iter( (char*) it - aVal ); }
258
259 bool operator==( const uni_iter& other ) const { return it == other.it; }
260 bool operator!=( const uni_iter& other ) const { return it != other.it; }
261
264 bool operator< ( const uni_iter& other ) const { return it < other.it; }
265 bool operator<=( const uni_iter& other ) const { return it <= other.it; }
266 bool operator> ( const uni_iter& other ) const { return it > other.it; }
267 bool operator>=( const uni_iter& other ) const { return it >= other.it; }
268
269 private:
270 friend class UTF8;
271
272 const unsigned char* it;
273
274 // private constructor
275 uni_iter( const char* start ) :
276 it( (const unsigned char*) start )
277 {
278 }
279 };
280
285 {
286 return uni_iter( m_s.data() );
287 }
288
293 {
294 return uni_iter( m_s.data() + m_s.size() );
295 }
296
305 static int uni_forward( const unsigned char* aSequence, unsigned* aResult = nullptr );
306#endif // SWIG
307
308protected:
309 std::string m_s;
310};
311
312
313std::ostream& operator<<( std::ostream& aStream, const UTF8& aRhs );
314
315#endif // UTF8_H_
uni_iter is a non-mutating iterator that walks through unicode code points in the UTF8 encoded string...
Definition: utf8.h:209
uni_iter(const uni_iter &o)
Definition: utf8.h:216
unsigned operator*() const
return unicode at current position
Definition: utf8.h:248
bool operator!=(const uni_iter &other) const
Definition: utf8.h:260
uni_iter(const char *start)
Definition: utf8.h:275
const unsigned char * it
Definition: utf8.h:272
bool operator<(const uni_iter &other) const
Since the ++ operators advance more than one byte, this is your best loop termination test,...
Definition: utf8.h:264
uni_iter operator-(int aVal) const
Definition: utf8.h:257
bool operator>=(const uni_iter &other) const
Definition: utf8.h:267
bool operator<=(const uni_iter &other) const
Definition: utf8.h:265
bool operator==(const uni_iter &other) const
Definition: utf8.h:259
bool operator>(const uni_iter &other) const
Definition: utf8.h:266
uni_iter operator++(int)
post-increment and return uni_iter at initial position
Definition: utf8.h:229
const uni_iter & operator++()
pre-increment and return uni_iter at new position
Definition: utf8.h:222
unsigned operator->() const
return unicode at current position
Definition: utf8.h:238
An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion support to and fro...
Definition: utf8.h:72
UTF8 & operator+=(const UTF8 &str)
Definition: utf8.h:128
static constexpr std::string::size_type npos
Definition: utf8.h:156
std::string::const_iterator begin() const
Definition: utf8.h:197
UTF8 & operator=(const std::string &o)
Definition: utf8.h:160
std::string::size_type length() const
Definition: utf8.h:110
std::string m_s
Definition: utf8.h:309
std::string::size_type find(char c) const
Definition: utf8.h:106
void clear()
Definition: utf8.h:109
~UTF8()
Definition: utf8.h:98
bool empty() const
Definition: utf8.h:104
bool operator!=(const UTF8 &rhs) const
Definition: utf8.h:118
std::string substr(size_t pos=0, size_t len=npos) const
Definition: utf8.h:183
std::string::size_type find(char c, size_t s) const
Definition: utf8.h:107
uni_iter uend() const
Return a uni_iter initialized to the end of "this" UTF8 byte sequence.
Definition: utf8.h:292
bool operator==(const std::string &rhs) const
Definition: utf8.h:115
static int uni_forward(const unsigned char *aSequence, unsigned *aResult=nullptr)
Advance over a single UTF8 encoded multibyte character, capturing the Unicode character as it goes,...
Definition: utf8.cpp:67
UTF8(const std::string &o)
Definition: utf8.h:88
UTF8 & operator+=(const char *s)
Definition: utf8.h:142
int compare(const std::string &s) const
Definition: utf8.h:112
std::string::size_type find_first_of(const std::string &str, std::string::size_type pos=0) const
Definition: utf8.h:122
UTF8 & operator+=(char ch)
Definition: utf8.h:135
UTF8(const char *txt)
This is a constructor for which you could end up with non-UTF8 encoding, but that would be your fault...
Definition: utf8.h:78
const char * c_str() const
Definition: utf8.h:103
bool operator<(const UTF8 &rhs) const
Definition: utf8.h:119
std::string::size_type size() const
Definition: utf8.h:111
wxString wx_str() const
Definition: utf8.cpp:45
UTF8 & operator=(char c)
Definition: utf8.h:174
uni_iter ubegin() const
Returns a uni_iter initialized to the start of "this" UTF8 byte sequence.
Definition: utf8.h:284
UTF8 & operator=(const char *s)
Definition: utf8.h:167
bool operator==(const char *s) const
Definition: utf8.h:116
std::string::const_iterator end() const
Definition: utf8.h:198
UTF8 & operator=(const wxString &o)
Definition: utf8.cpp:57
bool operator==(const UTF8 &rhs) const
Definition: utf8.h:114
UTF8()
Definition: utf8.h:94
bool operator>(const UTF8 &rhs) const
Definition: utf8.h:120
#define MAYBE_VERIFY_UTF8(x)
Definition: utf8.h:50
bool IsUTF8(const char *aString)
Test a C string to see if it is UTF8 encoded.
Definition: utf8.cpp:179
std::ostream & operator<<(std::ostream &aStream, const UTF8 &aRhs)
Definition: utf8.cpp:244