KiCad PCB EDA Suite
utf8.h
Go to the documentation of this file.
1 /*
2  * This program source code file is part of KiCad, a free EDA CAD application.
3  *
4  * Copyright (C) 2013 SoftPLC Corporation, Dick Hollenbeck <dick@softplc.com>
5  * Copyright (C) 2013-2021 KiCad Developers, see AUTHORS.txt for contributors.
6  *
7  * @author Dick Hollenbeck
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License
11  * as published by the Free Software Foundation; either version 2
12  * of the License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, you may find one here:
21  * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
22  * or you may search the http://www.gnu.org website for the version 2 license,
23  * or you may write to the Free Software Foundation, Inc.,
24  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25  */
26 
27 #ifndef UTF8_H_
28 #define UTF8_H_
29 
30 #include <string>
31 #include <wx/string.h>
32 
// Turn on UTF8 content verification (consumed by MAYBE_VERIFY_UTF8) in DEBUG builds only.
33 #if defined(DEBUG)
34  #define UTF8_VERIFY // Might someday be a hidden cmake config option
35 #endif
36 
37 
/**
 * Test a C string to see if it is UTF8 encoded.
 *
 * @param aString is the C string to test.
 * @return true if aString is UTF8 encoded. The implementation lives in utf8.cpp.
 */
43 bool IsUTF8( const char* aString );
44 
45 
// MAYBE_VERIFY_UTF8(x): when UTF8_VERIFY is defined, assert through wxASSERT that the
// C string x passes IsUTF8(); otherwise the macro expands to nothing, so release builds
// pay no cost for the check.
46 #if defined(UTF8_VERIFY)
47  #define MAYBE_VERIFY_UTF8(x) wxASSERT( IsUTF8(x) )
48 #else
49  #define MAYBE_VERIFY_UTF8(x) // nothing
50 #endif
51 
52 
/**
 * An 8-bit string that is intended to always hold UTF8 encoded text, with conversions
 * to and from wxString.
 *
 * The bytes are stored in a std::string member and only a selected subset of the
 * std::string interface is re-exported. begin()/end() iterate over raw bytes, while
 * ubegin()/uend() iterate over Unicode code points through uni_iter.
 *
 * The inline constructors and mutators that accept byte-oriented input copy the bytes
 * without conversion and then invoke MAYBE_VERIFY_UTF8, which asserts on the content
 * when UTF8_VERIFY is defined and expands to nothing otherwise.
 */
class UTF8
{
public:
    /// Construct from a wxString (defined out of line).
    UTF8( const wxString& o );

    /**
     * Construct from a C string, copying its bytes as-is.
     *
     * No conversion is performed, so handing in text that is not UTF8 leaves this object
     * holding non-UTF8 data; that is the caller's responsibility.
     *
     * @param txt is the C string to copy.
     */
    UTF8( const char* txt ) :
            m_s( txt )
    {
        MAYBE_VERIFY_UTF8( c_str() );
    }

    /// Construct from a wide character C string (defined out of line).
    UTF8( const wchar_t* txt );

    /// Construct from a std::string, copying its bytes as-is.
    UTF8( const std::string& o ) :
            m_s( o )
    {
        MAYBE_VERIFY_UTF8( c_str() );
    }

    UTF8()
    {
    }

    ~UTF8()     // Needed mainly to build python wrapper
    {
    }

    // Re-export the std::string calls that callers need; the storage itself (m_s) is a
    // protected member and is not exposed for modification.
    const char* c_str() const   { return m_s.c_str(); }
    bool empty() const          { return m_s.empty(); }

    std::string::size_type find( char c ) const             { return m_s.find( c ); }
    std::string::size_type find( char c, size_t s ) const   { return m_s.find( c, s ); }

    void clear()                                            { m_s.clear(); }
    std::string::size_type length() const                   { return m_s.length(); }
    std::string::size_type size() const                     { return m_s.size(); }
    int compare( const std::string& s ) const               { return m_s.compare( s ); }

    // All comparisons are plain byte-wise comparisons of the stored string.
    bool operator==( const UTF8& rhs ) const                { return m_s == rhs.m_s; }
    bool operator==( const std::string& rhs ) const         { return m_s == rhs; }
    bool operator==( const char* s ) const                  { return m_s == s; }

    std::string::size_type find_first_of( const std::string& str,
                                          std::string::size_type pos = 0 ) const
    {
        return m_s.find_first_of( str, pos );
    }

    /// Append the bytes of another UTF8.
    UTF8& operator+=( const UTF8& str )
    {
        m_s += str.m_s;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Append a single byte.
    UTF8& operator+=( char ch )
    {
        m_s.operator+=( ch );
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Append the bytes of a C string.
    UTF8& operator+=( const char* s )
    {
        m_s.operator+=( s );
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Append w_ch (defined out of line).
    /// NOTE(review): presumably w_ch is a Unicode code point that gets encoded as UTF8;
    /// confirm against utf8.cpp.
    UTF8& operator+=( unsigned w_ch );

    // std::string::npos is not constexpr, so we can't use it in an
    // initializer.  The cast makes the intended wrap-around to the largest
    // size_type value explicit.
    static constexpr std::string::size_type npos = static_cast<std::string::size_type>( -1 );

    /// Assign from a wxString (defined out of line).
    UTF8& operator=( const wxString& o );

    /// Replace the content with a copy of the bytes of a std::string.
    UTF8& operator=( const std::string& o )
    {
        m_s = o;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Replace the content with a copy of the bytes of a C string.
    UTF8& operator=( const char* s )
    {
        m_s = s;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Replace the content with a single byte.
    UTF8& operator=( char c )
    {
        m_s = c;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    // a substring of a UTF8 is not necessarily a UTF8 if a multibyte character
    // was split, so return std::string not UTF8
    std::string substr( size_t pos = 0, size_t len = npos ) const
    {
        return m_s.substr( pos, len );
    }

    /// Read-only view of the underlying byte string.
    operator const std::string& () const    { return m_s; }
    //operator std::string& ()                { return m_s; }
    //operator std::string () const           { return m_s; }

    /// Convert to a wxString (defined out of line).
    wxString wx_str() const;
    operator wxString () const;

    // "Read only" iterating over bytes is done with these, use the uni_iter to iterate
    // over UTF8 (multi-byte) characters
    std::string::const_iterator begin() const   { return m_s.begin(); }
    std::string::const_iterator end() const     { return m_s.end(); }

#ifndef SWIG

    /**
     * uni_iter is a non-mutating iterator that walks through unicode code points in the
     * UTF8 encoded string.
     *
     * It is a thin wrapper around a byte pointer into the owning UTF8; every step and
     * every dereference is delegated to UTF8::uni_forward().
     */
    class uni_iter
    {
    public:
        uni_iter() :    // Needed only to build python wrapper, not used outside the wrapper
                it( nullptr )
        {
        }

        uni_iter( const uni_iter& o ) :
                it( o.it )
        {
        }

        /// Pre-increment and return uni_iter at new position.
        const uni_iter& operator++()
        {
            it += uni_forward( it );
            return *this;
        }

        /// Post-increment and return uni_iter at initial position.
        uni_iter operator++( int )
        {
            uni_iter ret = *this;

            it += uni_forward( it );
            return ret;
        }

        /// Return unicode at current position.
        unsigned operator->() const
        {
            unsigned result;

            // grab the result, do not advance
            uni_forward( it, &result );
            return result;
        }

        /// Return unicode at current position.
        unsigned operator*() const
        {
            unsigned result;

            // grab the result, do not advance
            uni_forward( it, &result );
            return result;
        }

        /// Return an iterator moved back by aVal bytes (not code points).
        uni_iter operator-( int aVal ) const
        {
            // Keep the pointee const; only the character type is reinterpreted.
            return uni_iter( reinterpret_cast<const char*>( it ) - aVal );
        }

        bool operator==( const uni_iter& other ) const  { return it == other.it; }
        bool operator!=( const uni_iter& other ) const  { return it != other.it; }

        /// Since the ++ operators advance more than one byte, this is your best
        /// loop termination test: compare with < uend() rather than != uend().
        bool operator< ( const uni_iter& other ) const  { return it <  other.it; }
        bool operator<=( const uni_iter& other ) const  { return it <= other.it; }
        bool operator> ( const uni_iter& other ) const  { return it >  other.it; }
        bool operator>=( const uni_iter& other ) const  { return it >= other.it; }

    private:
        friend class UTF8;

        const unsigned char* it;    ///< current byte position inside the owning UTF8

        // private constructor
        uni_iter( const char* start ) :
                it( reinterpret_cast<const unsigned char*>( start ) )
        {
        }
    };

    /// Return a uni_iter initialized to the start of "this" UTF8 byte sequence.
    uni_iter ubegin() const
    {
        return uni_iter( m_s.data() );
    }

    /// Return a uni_iter initialized to the end of "this" UTF8 byte sequence.
    uni_iter uend() const
    {
        return uni_iter( m_s.data() + m_s.size() );
    }

    /**
     * Advance over a single UTF8 encoded multibyte character, capturing the Unicode
     * character as it goes (defined out of line).
     *
     * @param aSequence is the UTF8 byte sequence to read from.
     * @param aResult is where to put the unicode character, may be nullptr if the
     *                caller only needs the advance count.
     * @return the amount the caller adds to its byte pointer to step past the character.
     */
    static int uni_forward( const unsigned char* aSequence, unsigned* aResult = nullptr );
#endif  // SWIG

protected:
    std::string m_s;    ///< the stored bytes
};
306 
307 
308 #endif // UTF8_H_
An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion support to and fro...
Definition: utf8.h:70
UTF8 & operator=(const wxString &o)
Definition: utf8.cpp:58
static constexpr std::string::size_type npos
Definition: utf8.h:151
int compare(const std::string &s) const
Definition: utf8.h:111
bool operator==(const UTF8 &rhs) const
Definition: utf8.h:113
uni_iter(const uni_iter &o)
Definition: utf8.h:211
bool operator>(const uni_iter &other) const
Definition: utf8.h:261
uni_iter uend() const
Return a uni_iter initialized to the end of "this" UTF8 byte sequence.
Definition: utf8.h:287
bool operator==(const std::string &rhs) const
Definition: utf8.h:114
bool IsUTF8(const char *aString)
Test a C string to see if it is UTF8 encoded.
Definition: utf8.cpp:180
UTF8()
Definition: utf8.h:93
const char * c_str() const
Definition: utf8.h:102
unsigned operator->() const
Return the Unicode code point at the current position.
Definition: utf8.h:233
uni_iter ubegin() const
Return a uni_iter initialized to the start of "this" UTF8 byte sequence.
Definition: utf8.h:279
std::string::size_type find_first_of(const std::string &str, std::string::size_type pos=0) const
Definition: utf8.h:117
std::string::size_type find(char c) const
Definition: utf8.h:105
std::string::size_type find(char c, size_t s) const
Definition: utf8.h:106
UTF8 & operator=(const std::string &o)
Definition: utf8.h:155
UTF8(const char *txt)
This is a constructor for which you could end up with non-UTF8 encoding, but that would be your fault...
Definition: utf8.h:77
std::string::size_type length() const
Definition: utf8.h:109
std::string m_s
Definition: utf8.h:304
unsigned operator *() const
Return the Unicode code point at the current position.
Definition: utf8.h:243
bool operator>=(const uni_iter &other) const
Definition: utf8.h:262
UTF8 & operator=(const char *s)
Definition: utf8.h:162
const uni_iter & operator++()
Pre-increment and return the uni_iter at its new position.
Definition: utf8.h:217
UTF8 & operator+=(const char *s)
Definition: utf8.h:137
UTF8 & operator+=(const UTF8 &str)
Definition: utf8.h:123
bool operator==(const char *s) const
Definition: utf8.h:115
uni_iter is a non-mutating iterator that walks through unicode code points in the UTF8 encoded string...
Definition: utf8.h:203
UTF8 & operator=(char c)
Definition: utf8.h:169
bool operator==(const uni_iter &other) const
Definition: utf8.h:254
uni_iter(const char *start)
Definition: utf8.h:270
bool operator<(const uni_iter &other) const
Since the ++ operators advance more than one byte, this is your best loop termination test,...
Definition: utf8.h:259
UTF8 & operator+=(char ch)
Definition: utf8.h:130
std::string::const_iterator end() const
Definition: utf8.h:193
uni_iter operator++(int)
Post-increment and return a uni_iter at the initial position.
Definition: utf8.h:224
UTF8(const std::string &o)
Definition: utf8.h:87
wxString wx_str() const
Definition: utf8.cpp:46
uni_iter operator-(int aVal) const
Definition: utf8.h:252
bool operator<=(const uni_iter &other) const
Definition: utf8.h:260
std::string::const_iterator begin() const
Definition: utf8.h:192
std::string substr(size_t pos=0, size_t len=npos) const
Definition: utf8.h:178
const unsigned char * it
Definition: utf8.h:267
bool operator!=(const uni_iter &other) const
Definition: utf8.h:255
std::string::size_type size() const
Definition: utf8.h:110
void clear()
Definition: utf8.h:108
static int uni_forward(const unsigned char *aSequence, unsigned *aResult=nullptr)
Advance over a single UTF8 encoded multibyte character, capturing the Unicode character as it goes,...
Definition: utf8.cpp:68
~UTF8()
Definition: utf8.h:97
#define MAYBE_VERIFY_UTF8(x)
Definition: utf8.h:49
bool empty() const
Definition: utf8.h:103