InDesign SDK  20.5
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
WideString.h
1 //========================================================================================
2 //
3 // $File$
4 //
5 // Owner: Nat McCully
6 //
7 // $Author$
8 //
9 // $DateTime$
10 //
11 // $Revision$
12 //
13 // $Change$
14 //
15 // Copyright 1997-2010 Adobe Systems Incorporated. All rights reserved.
16 //
17 // NOTICE: Adobe permits you to use, modify, and distribute this file in accordance
18 // with the terms of the Adobe license agreement accompanying it. If you have received
19 // this file from a source other than Adobe, then your use, modification, or
20 // distribution of it requires the prior written permission of Adobe.
21 //
22 //
23 // A UTF-16 encoded string (16-bit code values).
24 //
25 // SEE USAGE EXAMPLES AT THE BOTTOM OF THIS FILE!
26 //
27 //========================================================================================
28 
29 #ifndef __WIDESTRING__
30 #define __WIDESTRING__
31 
32 #include "PMUtils.h"
33 #include "K2Iterator.h"
34 #include "K2TypeTraits.h"
35 #include "Invariant.h"
36 #include "UnicodeClass.h"
37 #include "K2SmartPtr.h"
38 
39 #include "Trace.h" // /public/includes/
40 
41 #include "UnicodeSavvyString.h"
42 
43 class PMString;
44 class IPMStream;
45 class TextIterator;
46 
47 
48 // SEE USAGE EXAMPLES AT THE BOTTOM OF THIS FILE!
49 
50 // Alias for a Unicode code point as defined by the Unicode standard.
51 // Long story short: UTF32TextChar was introduced because we wanted to provide
52 // some type safety. Surprisingly, compilers will happily accept the following code:
53 //
54 // typedef uint32 UniCodePoint;
55 // UniCodePoint u(0xFFFFFFFF);
56 // char c = u;
57 //
58 // Not even a warning is generated. The intention of providing some basic type safety
59 // is honorable and we respect that. However, we ran across a big problem when trying
60 // to use UTF32TextChar in conjunction with boost::tokenizer. The <xstring> header shipped
61 // with the .NET 2003 compiler has a small string optimization defined like this:
62 //
63 // union _Bxty{
64 // _Elem _Buf[_BUF_SIZE];
65 // _Elem *_Ptr;
66 // } _Bx;
67 //
68 // Unfortunately, when the above template code is instantiated with _Elem = UTF32TextChar,
69 // we get a compiler error because types that have constructors cannot be placed in unions.
70 //
71 // Until we find a solution that will satisfy these two conflicting requirements, we will
72 // use this typedef for our Unicode code point type.
73 //
74 // BEWARE: DON'T MAKE ANY ASSUMPTIONS in your code about the size of a UniCodePoint.
75 // DO NOT assume that it will always be 32-bit and don't cast it around. Just use it as it is.
76 //
77 using UniCodePoint = uint32; // plain alias (no distinct type), so it stays usable inside unions
78 
79 class WideString;
80 
89 {
90 public:
91 
92  // Iterator traits. NOTE(review): only approximately conforming - value_type is a code point while pointer/reference name UTF-16 storage.
93  using iterator_category = std::bidirectional_iterator_tag;
94  using value_type = UniCodePoint;
95  using difference_type = std::ptrdiff_t;
96  using pointer = UTF16TextChar*;
97  using reference = UTF16TextChar&;
98 
99  WideStringConstUTF32Iter(const WideString *string, int32 charIndex = 0); // starts at code point charIndex of *string; defined inline below WideString, which it needs as a complete type
100 
101  WideStringConstUTF32Iter(const UTF16TextChar *buffer, bool16 hasSurrogates, int32 numChars)
102  : fCurrent(buffer)
103  , fHasSurrogates(hasSurrogates)
104 #ifdef DEBUG
105  , fNumChars(numChars) // DEBUG-only upper bound used by the position asserts
106  , fPosition(0)
107 #endif
108  { (void)numChars; } // numChars only feeds the DEBUG fields; touch it so release builds see it used
110 
115  const UTF16TextChar* PtrAt() const // raw pointer to the UTF-16 code value the iterator currently addresses (no decoding)
116  { return fCurrent; }
117 
121  value_type operator*() const
122  {
123 #ifdef DEBUG
124  ASSERT(fPosition >= 0 && fPosition <= fNumChars); // the tracked position may equal fNumChars
125 #endif
126  if (!fHasSurrogates) return *fCurrent; // no surrogates flagged: the code value itself is returned
127  return surro_operStar(); }
128 
132  { if (fHasSurrogates) surro_operPP();
133  else ++fCurrent;
134 #ifdef DEBUG
135  ++fPosition;
136  ASSERT(fPosition <= fNumChars); // can be one off end
137 #endif
138  return *this; }
139 
143  { WideStringConstUTF32Iter tmp(*this); ++(*this); return tmp; }
144 
148  { if (fHasSurrogates) surro_operMM();
149  else --fCurrent;
150 #ifdef DEBUG
151  --fPosition;
152  ASSERT(fPosition >= 0);
153 #endif
154  return *this; }
155 
159  { WideStringConstUTF32Iter tmp(*this); --(*this); return tmp; }
160 
161  WideStringConstUTF32Iter& operator+=(int32 n)
162  { if (!fHasSurrogates) {
163  fCurrent += n; // plain pointer arithmetic on the UTF-16 buffer
164 #ifdef DEBUG
165  fPosition += n;
166 #endif
167  }
168  else surro_operPE(n); // surrogate-aware advance is implemented out of line
169 #ifdef DEBUG
170  ASSERT(fPosition <= fNumChars); // can be one off end
171 #endif
172  return *this; }
173 
174  WideStringConstUTF32Iter operator+(int32 n) const
175  { WideStringConstUTF32Iter advanced(*this); advanced += n; return advanced; } // copy, then reuse operator+=
176 
177  WideStringConstUTF32Iter& operator-=(int32 n)
178  { if (!fHasSurrogates) {
179  fCurrent -= n; // plain pointer arithmetic on the UTF-16 buffer
180 #ifdef DEBUG
181  fPosition -= n;
182 #endif
183  }
184  else surro_operME(n); // surrogate-aware retreat is implemented out of line
185 #ifdef DEBUG
186  ASSERT(fPosition >= 0);
187 #endif
188  return *this; }
189 
190  WideStringConstUTF32Iter operator-(int32 n) const
191  { WideStringConstUTF32Iter retreated(*this); retreated -= n; return retreated; } // copy, then reuse operator-=
192 
193  int32 operator-(const WideStringConstUTF32Iter& other) const
194  { if (fHasSurrogates) return surro_operDiff(other); return static_cast<int32>(fCurrent - other.fCurrent); } // pointer difference narrowed to int32, now spelled out
195 
196  UTF32TextChar operator[](int32 i) const
197  { return *(*this + i); } // value at offset i from the current position; *this is left untouched
198 
199  friend WideStringConstUTF32Iter operator+(int32 n, const WideStringConstUTF32Iter& rhs)
200  { return rhs + n; } // commuted form: n + iterator
201 
202  friend bool operator==(const WideStringConstUTF32Iter& x, const WideStringConstUTF32Iter& y)
203  { return y.fCurrent == x.fCurrent; } // only the raw buffer address is compared
204 
205  friend bool operator!=(const WideStringConstUTF32Iter& x, const WideStringConstUTF32Iter& y)
206  { return !(x == y); }
207 
208  friend bool operator<(const WideStringConstUTF32Iter& x, const WideStringConstUTF32Iter& y)
209  { return y.fCurrent > x.fCurrent; } // ordering follows the address within the buffer
210 
211  friend bool operator<=(const WideStringConstUTF32Iter& x, const WideStringConstUTF32Iter& y)
212  { return !(y < x); }
213 
214  friend bool operator>(const WideStringConstUTF32Iter& x, const WideStringConstUTF32Iter& y)
215  { return y < x; }
216 
217  friend bool operator>=(const WideStringConstUTF32Iter& x, const WideStringConstUTF32Iter& y)
218  { return !(x < y); }
219 
220 #ifdef DEBUG
221 
     // Accessors for the DEBUG-only bookkeeping; they do not exist in release builds.
223  int32 Position() const // tracked position (fPosition)
224  { return fPosition; }
225 
228  int32 NumChars() const // bound the position asserts compare against (fNumChars)
229  { return fNumChars; }
230 
233  int32& NumChars() // mutable reference: lets a caller adjust the asserted bound
234  { return fNumChars; }
235 #endif
236 
237 private:
239  const UTF16TextChar * operator->() const { return fCurrent; } // private, so clients cannot use ->; NOTE(review): presumably because it yields a UTF-16 unit, not the UniCodePoint value_type - confirm
     // Out-of-line helpers used by the inline operators above when fHasSurrogates is set:
240  UniCodePoint surro_operStar() const; // operator*
241  void surro_operPP(); // operator++
242  void surro_operPE(int32 n); // operator+=
243  void surro_operMM(); // operator--
244  void surro_operME(int32 n); // operator-=
245  int32 surro_operDiff(const WideStringConstUTF32Iter& other) const; // iterator difference
246 
247 private:
248  const UTF16TextChar *fCurrent; // current position inside the UTF-16 buffer; nil when built from a null string
249  bool16 fHasSurrogates; // selects the surro_* helpers instead of plain pointer arithmetic
250 #ifdef DEBUG
251  int32 fNumChars; // DEBUG only: upper bound for fPosition
252  int32 fPosition; // DEBUG only: position tracked alongside fCurrent for the asserts
253 #endif
254 };
255 
256 
257 
260 template <typename T>
261 int32 Strip_If(WideString& string, T f); // forward declaration; the definition follows the WideString class
262 
267 {
268  public:
269  using data_type = object_type;
270  using reference_raw = UTF16TextChar&; // "_raw" names operate on UTF-16 code values
271  using const_reference_raw = const UTF16TextChar&;
272  using value_type_raw = UTF16TextChar;
273  using difference_type = std::ptrdiff_t;
274  using pointer_raw = UTF16TextChar*;
275  using const_pointer_raw = const UTF16TextChar*;
276 
277  using iterator_raw = UTF16TextChar*; // raw iterators are plain pointers into the buffer
278  using const_iterator_raw = const UTF16TextChar*;
281 
282  using reference = UTF32TextChar&; // unsuffixed names operate on whole UTF-32 characters
283  using const_reference = const UTF32TextChar&;
284  using value_type = UTF32TextChar;
285  using pointer = UTF32TextChar*;
286  using const_pointer = const UTF32TextChar*;
287 
288 // typedef WideStringUTF32Iter iterator;
291 
293 
294  WideString(); // default construction
295 
296  explicit WideString(WideString::const_pointer_raw s, int32 len = kMaxInt32, int32 numChars = -1); // from a raw UTF-16 pointer; NOTE(review): len/numChars presumably count code values/code points, with the defaults meaning "unspecified" - confirm
297 
298  WideString(const WideString::const_iterator& iter, int32 numChars); // from an iterator plus a character count
299 
300  WideString(const WideString& w); // copy construction
301 
302  WideString(WideString &&other) noexcept = default; // compiler-generated move construction
303 
304  explicit WideString(ConstCString string, int32 numChars = -1); // explicit: C strings never convert silently
305 
306  explicit WideString(const PMString& s); // explicit: PMString never converts silently
307 
308  explicit WideString(const wchar_t* s, int32 len = kMaxInt32); // wchar_t is a distinct type on CW
309 
318  template <class IteratorType>
319  WideString(IteratorType b, IteratorType e, size_type nCodePoints = 0)
321  {
322  assign(b, e, nCodePoints);
323  }
324 
327  WideString(adobe::move_from<WideString> other) // adobe::move_from-style move construction
328  : UnicodeSavvyString(adobe::move_from<UnicodeSavvyString>(other.source)) // re-wraps other.source for the base-class constructor
329  {}
330 
333  ~WideString();
334 
338  void SetString(const WideString& s);
339  void SetString(const WideString::const_iterator& iter, int32 numChars);
340 
345  void SetCString(ConstCString C, bool16 convertEmbeddedUnicode = kFalse, int32 numChars = -1);
346 
347  void SetX16String(WideString::const_pointer_raw x, int32 len = kMaxInt32, int32 numChars = -1); // already 16 bit string
348  void SetX16String(const wchar_t* s, int32 len = kMaxInt32, int32 numChars = -1);
349 
363  iterator_raw begin_raw();
364  const_iterator_raw begin_raw() const;
365 
366 
367  iterator_raw end_raw();
368 
369  const_iterator_raw end_raw() const;
370 
371 
372  reverse_iterator_raw rbegin_raw() // reverse iteration over raw code values starts from end_raw(); the non-const form traces when HasMultiWordUnicode()
373  { TRACE_IF(HasMultiWordUnicode(), "About to hand back non-const iterator on WideString's buffer! Be careful with surrogates!"); return reverse_iterator_raw(end_raw()); }
374  const_reverse_iterator_raw rbegin_raw() const // const form: no trace
375  { return const_reverse_iterator_raw(end_raw()); }
376 
377  reverse_iterator_raw rend_raw() // reverse end wraps begin_raw(); same trace as rbegin_raw()
378  { TRACE_IF(HasMultiWordUnicode(), "About to hand back non-const iterator on WideString's buffer! Be careful with surrogates!"); return reverse_iterator_raw(begin_raw()); }
379  const_reverse_iterator_raw rend_raw() const // const form: no trace
380  { return const_reverse_iterator_raw(begin_raw()); }
381 
382 
388  { return const_iterator(this, 0); }
389 
390  const_iterator end() const // iterator at index CharCount(), i.e. one past the last character
391  { return const_iterator(this, CharCount()); }
392 
393  const_reverse_iterator rbegin() const // reverse iteration starts from end()
394  { return const_reverse_iterator(end()); }
395 
396  const_reverse_iterator rend() const // reverse iteration stops at begin()
397  { return const_reverse_iterator(begin()); }
398 
399  //---------------------------
400 
401 
402  int32 UTF16IndexToCodePointIndex(int32 index) const;
403 
404  //---------------------------
405 
406  bool16 empty() const // true when the UTF-16 buffer length is zero
407  { return fUTF16BufferLength == 0; }
408 
409  void Clear() // capitalized alias of clear()
410  { clear(); }
411 
414  int32 Length() const // same value as CharCount()
415  { return CharCount(); }
416 
417  bool16 IsNull() const // same as empty(); does not distinguish null from empty
418  { return empty(); }
419 
420  //---------CHAR LEVEL ROUTINES
421  UTF32TextChar GetChar(int32 pos) const // forwards to GetUTF32TextChar(pos)
422  { return GetUTF32TextChar(pos); }
423  UTF32TextChar LastChar() const;
424  UTF32TextChar FirstChar() const;
425 
426  void push_back(WideString::value_type c) // container-style spelling of Append(c); used by std::back_inserter
427  { Append(c); }
428 
429  // these routines append only, no positioning provided (use insert)
430 
431  void Append(const WideString &s);
432 
433  void Append(const WideString::const_iterator& iter, int32 numChars)
434  { const WideString::const_pointer_raw first = iter.PtrAt(); Append(first, (iter + numChars).PtrAt() - first, numChars); } // raw length = pointer distance covered by numChars characters
435 
436  void Append(WideString::const_pointer_raw buf, int32 n, int32 numChars = -1);
437 
445  WideString& append(WideString::const_pointer_raw s, size_type nCodeValues, size_type nCodePoints = 0) // forwards to the base class; returns *this so calls can chain
446  {
447  UnicodeSavvyString::append(s, nCodeValues, nCodePoints);
448  return *this;
449  }
450 
461  {
462  return replace(b, e, s.begin(), s.end());
463  }
464 
477 
484  {
485  return replace(b, e, b, b);
486  }
487 
488  void Append(WideString::value_type c32) // appends one UTF-32 character via the base class
489  { UnicodeSavvyString::AppendUTF32TextChar(c32); }
490 
491  // These routines insert a string, a raw buffer, or a single character at the given position.
492  void Insert(const WideString &s, int32 position = 0, int32 count = kMaxInt32);
493  void Insert(WideString::const_pointer_raw buf, int32 len, int32 pos = 0) // does nothing when len <= 0
494  { if (len > 0) UnicodeSavvyString::InsertUTF16String(buf, len, pos); }
495  void Insert(WideString::value_type c, int32 pos = 0) // single character, forwarded to the base class
496  { UnicodeSavvyString::InsertUTF32TextChar(c, pos); }
497 
498  WideString* Substring(int32 position, int32 count = kMaxInt32) const; // return string from position on, zero = rest
499 
507  WideString* GetItem(const WideString& delimiter, const int32 nItem) const; // return the item-th delimited token, one based
508 
509  void remove_raw(int32 utf16Pos, int32 utf16Count);
510  void RemoveCodePoints(int32 startCodePointIndex, int32 numCodePoints); // assumes legal arguments, doesn't call CountChars()
511 
512  int32 IndexOf(const WideString& keyString, int32 position = 0) const; // position of non-overlapping occurrence of key string
513  int32 IndexOf(WideString::value_type c, int32 position = 0) const;
514 
515  bool16 Contains(const WideString& key, int32 pos = 0) const // true when IndexOf(key, pos) is non-negative
516  { return IndexOf(key, pos) >= 0; }
517 
520  void Shrink(bool16 maxshrink = kFalse);
521 
522  //----------OPERATORS---------------
523 
526  {
527  UnicodeSavvyString::CopyFrom(other);
528  return *this;
529  }
530 
532  WideString& operator = (WideString &&other) noexcept
533  {
535  return *this;
536  }
537 
538  WideString& operator = (WideString::const_pointer_raw copy) // assigns from a raw UTF-16 pointer
539  {
     // Range end comes from UTF16TextCharLength(copy). NOTE(review): presumably requires a non-null, terminated buffer - confirm.
540  return assign(copy, copy + UTF16TextCharLength(copy));
541  }
542 
545  const UTF32TextChar operator[](int32 index) const // read-only: returns by value via GetChar(); use SetChar() to modify
546  { return GetChar(index); }
547 
548  void SetChar(int32 index, WideString::value_type c32);
549 
550  int32 compare(const WideString& s) const;
551 
560  template <class IteratorType>
561  WideString& assign(IteratorType b, IteratorType e, size_type nCodePoints = 0)
562  {
563  // Replaces the contents with the range [b, e). nCodePoints is an optional count; 0 means the caller did not supply one.
564  typedef typename std::iterator_traits<IteratorType>::value_type iterator_value_type;
565 
566  // We accept iterators to UTF16 or UTF32 ranges only!
567  static_assert(sizeof(iterator_value_type)*CHAR_BIT == 16 || sizeof(iterator_value_type)*CHAR_BIT == 32,
568  "WideString::assign requires iterators over 16-bit or 32-bit code values");
569 
570  // The code values should be unsigned values
571  static_assert(!std::numeric_limits<iterator_value_type>::is_signed,
572  "WideString::assign requires an unsigned code value type");
573 
574  // Dispatch to the correct impl (UTF16 or UTF32 source): the tag type carries the bit width
575  EncodingSelector<sizeof(iterator_value_type)*CHAR_BIT> encodingSel;
576  WideString::assign_impl(b, e, nCodePoints, encodingSel);
577  return *this; }
578 
585  inline WideString& assign(WideString::const_pointer_raw src, size_type nCodeValues, size_type nCodePoints = 0) // pointer + count form of assign()
586  {
587  // Check for negative values: a caller's -1 shows up here as static_cast<size_type>(-1)
588  ASSERT_MSG(nCodeValues != static_cast<size_type>(-1), "-1 is not a valid value for nCodeValues");
589  ASSERT_MSG(nCodePoints != static_cast<size_type>(-1), "-1 is not a valid value for nCodePoints");
590 
591  return assign(src, src + nCodeValues, nCodePoints); // delegates to the iterator-range overload with [src, src + nCodeValues)
592  }
593 
594 
595  bool16 operator >= (const WideString &s) const // the four ordering operators test the sign of compare(s)
596  { return compare(s) >= 0; }
597  bool16 operator > (const WideString &s) const
598  { return compare(s) > 0; }
599  bool16 operator <= (const WideString &s) const
600  { return compare(s) <= 0; }
601  bool16 operator < (const WideString &s) const
602  { return compare(s) < 0; }
603 
604  bool16 operator == (const WideString &s) const;
605  bool16 operator != (const WideString &s) const // negation of operator==
606  { return !(*this == s); }
607 
608  bool16 operator == (WideString::const_pointer_raw b) const;
609  bool16 operator != (const UTF16TextChar *b) const // negation of the raw-pointer operator==; same parameter type as const_pointer_raw
610  { return !(*this == b); }
611 
612  WideString& operator +=(WideString::value_type c)
613  { Append(c); // Append(value_type) forwards to UnicodeSavvyString::AppendUTF32TextChar
614  return *this; }
615 
616  WideString& operator +=(const WideString& s)
617  { Append(s); return *this; } // whole-string append, chainable
619 
620  //---------------MISC ROUTINES------------------------
623  uint32 Hash(void) const;
624  void ToLower();
625  void ToUpper();
626 
630  { return Strip_If(*this, [c](const auto& t){ return t==c;});}
631 
632  //----------------------------------------------------
633  // conversion routines.
634  // *** GetAsSystemString() is now obsolete. In order to convert a WideString
635  // *** into a PMString you can either:
636  // *** 1. Construct the PMString using a WideString argument.
637  // *** 2. Assign the WideString to the PMString using operator=.
638  // *** 3. Call GetSystemString().
639 // PMString* GetAsSystemString(void) const;
640  void BuildFromSystemString(const PMString& ss);
641  void GetSystemString( PMString *ss ) const;
642 
643  //------streaming functions---------
644  void ReadWrite(IPMStream* s);
645 
646  private:
647 
648  // Helper selector for assign() dispatch: an empty tag type whose parameter is the bit width (16 or 32) of the source code values; it picks the matching assign_impl overload
649  template <unsigned int i> class EncodingSelector {};
650 
653  template <class IteratorType>
654  inline void assign_impl(IteratorType b, IteratorType e, size_type nCodePoints, EncodingSelector<16>) // chosen for 16-bit sources; the tag parameter is unnamed because only its type matters
655  {
656  // UnicodeSavvyString handles UTF16s
657  UnicodeSavvyString::assign(b, e, nCodePoints);
658  }
659 
662  template <class IteratorType>
663  void assign_impl(IteratorType b, IteratorType e, size_type nCodePoints, EncodingSelector<32>)
664  {
665  clear(); // assignment replaces the old contents
666 
667  // Try to optimize allocations in case the client told us the number of code points.
668  if (nCodePoints != 0)
669  reserve(nCodePoints);
670 
671  // TODO: optimize this - every element still passes through a UTF32TextChar on its way into push_back
672  for (IteratorType it = b; it != e; ++it)
673  push_back(*it);
674  }
676 
677 
678 #ifdef DEBUG
679  // ---- Data (DEBUG builds only, so the class layout differs between DEBUG and release)
680  mutable InvariantCount fInvariant; // mutable; NOTE(review): presumably so the const Invariant() check can update it - confirm
681 public:
682  void Invariant() const;
683  // atomic updates and reads
684  static bool16 ts_ForceRoundtrip;
685 private:
686  bool16 ParseForMultiWordUnicode() const;
687 #endif
688 public:
689  static const boost::shared_ptr<WideString> kNil_shared_ptr; // NOTE(review): presumably a shared "no string" constant - confirm against its definition
690 };
691 
692 inline WideStringConstUTF32Iter::WideStringConstUTF32Iter(const WideString *string, int32 charIndex)
693 {
694  if (!string)
695  {
696  // No backing string: build an iterator with a nil position and no surrogate handling.
697  fCurrent = nil;
698  fHasSurrogates = kFalse;
699 #ifdef DEBUG
700  fPosition = 0;
701  fNumChars = 0;
702 #endif
703  return;
704  }
705  // Translate the code point index into an offset within the string's UTF-16 buffer.
706  fCurrent = string->GrabUTF16Buffer(nil) + string->CodePointIndexToUTF16Index(charIndex);
707  fHasSurrogates = string->HasMultiWordUnicode();
708 #ifdef DEBUG
709  fPosition = charIndex;
710  fNumChars = string->CharCount();
711 #endif
712 }
713 
714 inline void swap(WideString& left, WideString& right) noexcept
715 {
716  UnicodeSavvyString& leftBase = left; // view both operands as their base class ...
717  UnicodeSavvyString& rightBase = right;
718  swap(leftBase, rightBase); } // ... and delegate to the base-class swap
719 
722 template <typename T>
723 int32 Strip_If(WideString& string, T f)
724 {
725  // Removes every character for which f returns true; returns how many characters were removed.
726  const int32 charsBefore = string.CharCount();
727  int32 pendingRun = 0; // matching characters seen since the last character that was kept
728 
729  // Scan back to front so a removal never shifts the indices still to be visited.
730  for (int32 idx = charsBefore; idx-- > 0; )
731  {
732  if (f(string.GetChar(idx))) { ++pendingRun; continue; }
733  if (pendingRun == 0) continue;
734  string.RemoveCodePoints(idx + 1, pendingRun); // the run starts right after this kept character
735  pendingRun = 0;
736  }
737 
738  // A run still pending here sits at the very beginning of the string.
739  if (pendingRun != 0)
740  string.RemoveCodePoints(0, pendingRun);
741 
742  return charsBefore - string.CharCount();
743 }
745 
746 
747 inline WideString::const_iterator_raw ToRawIterator(const WideStringConstUTF32Iter& iter) // raw UTF-16 position of a code point iterator
748 { return iter.PtrAt(); }
749 inline WideString::const_pointer_raw IterToPtr(const WideString::const_iterator_raw& i) // address of the element i refers to
750 { return &(*i); }
751 inline int NumUTF16CodesFromCharacter(const WideString::value_type val)
752 { if (val > 0xFFFF) return 2; return 1; } // values above 0xFFFF take two UTF-16 code values (a surrogate pair), everything else one
753 
754 
755 
759 template <>
760 inline WideString& WideString::assign<WideString::const_iterator>(WideString::const_iterator b, WideString::const_iterator e, size_type nCodePoints)
761 {
762  const UTF16TextChar* const first = b.PtrAt(); // these iterators already address UTF-16 storage, so use the raw range
763  UnicodeSavvyString::assign(first, e.PtrAt(), nCodePoints);
764  return *this; }
765 
766 // specialize adobe::type_info<> to avoid typeid problems across DLL boundaries
767 ADOBE_NAME_TYPE_0("widestring:indesign:adobe",WideString);
768 
769 
828 #endif