InDesign SDK  20.5
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
StringUtils.h
1 //========================================================================================
2 //
3 // $File$
4 //
5 // Owner: Paul Sorrick
6 //
7 // $Author$
8 //
9 // $DateTime$
10 //
11 // $Revision$
12 //
13 // $Change$
14 //
15 // Copyright 1997-2010 Adobe Systems Incorporated. All rights reserved.
16 //
17 // NOTICE: Adobe permits you to use, modify, and distribute this file in accordance
18 // with the terms of the Adobe license agreement accompanying it. If you have received
19 // this file from a source other than Adobe, then your use, modification, or
20 // distribution of it requires the prior written permission of Adobe.
21 //
22 //========================================================================================
23 #ifndef __STRINGUTILS__
24 #define __STRINGUTILS__
25 
26 #include "IPMUnknown.h"
27 #include "PMString.h"
28 #include "WideString.h"
29 #include "EscapeTable.h"
30 #include <algorithm>
31 #include <utility>
32 #include "K2Pair.h"
33 #include <boost/range/iterator_range.hpp>
34 
35 // Forward declarations
36 class TextCharFilter;
37 class IPMStream;
38 
39 
40 namespace StringUtils
41 {
42 
43  // Array containing the replacement strings.
44  typedef K2Vector<WideString> ArgArray;
45 
55  void FormatPositionalArgs(WideString& formatString, ArgArray const& args);
56 
57 
58  // String parameter replacement - NOTE: you should have translated aString before passing it
59  // to this routine. It will be marked as non-translating by this routine.
60  // Note that more than one occurrence of a placeholder (^1, ^2, ^3, ^4, ^5, ^6, ^7) can occur and
61  // all will be replaced with the provided string.
62  void ReplaceStringParameters(
63  PMString *aString, // Input and output string that contains up to four different placeholders
64  const PMString& parm1, // String to replace ^1 with, may be empty
65  const PMString& parm2 = kNullString, // String to replace ^2 with, may be empty
66  const PMString& parm3 = kNullString, // String to replace ^3 with, may be empty
67  const PMString& parm4 = kNullString, // String to replace ^4 with, may be empty
68  const PMString& parm5 = kNullString, // String to replace ^5 with, may be empty
69  const PMString& parm6 = kNullString, // String to replace ^6 with, may be empty
70  const PMString& parm7 = kNullString // String to replace ^7 with, may be empty
71  );
72 
73  // Input UTF16 String, Returns a PMString Translated to a UTF8 String
74  void ConvertWideStringToUTF8 (const WideString & str, std::string & returnval );
75 
76  // An iterator range specifies a sequence of characters
77  typedef boost::iterator_range<const unsigned char*> UTF8CharRange;
78 
79  // Helper function - constructs an UTF8CharRange from a std::string
80  // This will be removed when ConvertUTF8ToWideString will be a template and it will work with any type of iterators (not only pointers)
81  inline UTF8CharRange MakeUTF8CharRange(const std::string& str)
82  {
83  const unsigned char* b((const unsigned char*)(str.c_str()));
84  const unsigned char* e(b + str.size());
85  return UTF8CharRange(b, e);
86  }
87 
95  inline UTF8CharRange MakeUTF8CharRange(const char* utf8Buffer, size_t length)
96  {
97  const unsigned char* b((const unsigned char*)utf8Buffer);
98  const unsigned char* e(b + length);
99  return UTF8CharRange(b, e);
100  }
101 
102 
113  UTF8CharRange ConvertUTF8RangeToWideString (const UTF8CharRange& utf8Range, WideString & returnval) ;
114 
115  // Overload for convenience when you already have a std::string
116  // If you have raw pointers use the UTF8CharRange version (faster - avoids std::string construction)
117  inline UTF8CharRange ConvertUTF8ToWideString (const std::string& str, WideString & returnval)
118  {
119  return ConvertUTF8RangeToWideString(::StringUtils::MakeUTF8CharRange(str), returnval);
120  }
121 
// Functor that checks if the specified character is in the range [0x20, 0x7E].
// This range does not include the control characters (like '\t' or '\n'), which are
// below 0x20, nor DEL (0x7F). Any value outside [0x20, 0x7E] - including bytes with the
// high bit set, whether char is signed or unsigned - is reported as not printable.
struct IsPrintableASCII : public std::function<bool (char) >
{
    inline bool operator()(char c) const
    {
        // Inclusive bounds of the printable ASCII block: space (0x20) through '~' (0x7E).
        return (c >= 0x20 && c <= 0x7E);
    }
};
132 
133  void Reverse(WideString& w);
134 
// std::back_insert_iterator doesn't work with our non-standard containers.
// Optimized insert iterator that saves on re-allocations: it appends through
// push_back() and, before appending, reserves extra capacity whenever fewer than
// two free slots are left.
// The container only needs capacity(), size(), reserve(), push_back() and a
// const_reference typedef.
// NOTE(review): the class head and the signatures of both operator= overloads and of
// operator* were missing from the listing this block was recovered from; they are
// reconstructed from the surviving bodies - confirm against the shipped header.
template<class Container>
class back_insert_iterator
    : public std::iterator<std::output_iterator_tag, void, void, void, void>
{
public:
    // Binds the iterator to cont; the container is held by reference, not copied.
    explicit back_insert_iterator(Container& cont)
        : fContainer(cont)
    {
    }

    // Appends val to the bound container, growing its capacity first when needed.
    back_insert_iterator<Container>& operator=(typename Container::const_reference val)
    {
        if ((fContainer.capacity() - fContainer.size()) < 2)
        {
            grow_capacity();
        }

        // push value into container
        fContainer.push_back(val);
        return (*this);
    }

    // Dereferencing yields the iterator itself so that "*it = value" appends.
    back_insert_iterator<Container>& operator*()
    {
        return (*this);
    }

    // Incrementing is a no-op; the insertion position is always the container's end.
    back_insert_iterator<Container>& operator++()
    {
        return (*this);
    }

    back_insert_iterator<Container> operator++(int)
    {
        return (*this);
    }

    // Copy assignment is a no-op: the reference member cannot be re-bound, but
    // algorithms that assign their result back ("out = std::copy(..., out)") need the
    // operator to exist.
    back_insert_iterator<Container>& operator=(const back_insert_iterator<Container>&)
    {
        return (*this);
    }

protected:

    // Grows the capacity of the container by a factor of 1.5 (plus one).
    // We want to have at least two empty slots available at the end.
    void grow_capacity(void)
    {
        size_t newSize(1);
        if (fContainer.capacity())
        {
            // Multiply by 1.5
            newSize = fContainer.capacity() + (fContainer.capacity() >> 1);
        }
        fContainer.reserve(++newSize);
    }

    Container& fContainer;
};
196 
199 
200  // Generic back_inserter
201 
202  template<class Container>
203  inline back_insert_iterator<Container> back_inserter(Container& cont)
204  {
206  }
207 
218  template <typename InputIt, typename OutputIt, typename CharType, typename EscapeType>
219  OutputIt Escape( InputIt srcBeg, InputIt srcEnd, OutputIt destBeg,
220  EscapeTable<CharType, EscapeType> const& mappingTable )
221  {
222  typedef typename EscapeTable<CharType, EscapeType>::EscapeMap MapType;
223  typedef typename EscapeTable<CharType, EscapeType>::EscapeMapKey MapKeyType;
224  typedef typename EscapeTable<CharType, EscapeType>::EscapeMapValue MapValueType;
225 
226  MapType const& searchMap = mappingTable.GetEscapeMap();
227  ASSERT( !searchMap.empty() );
228 
229  for ( ; srcBeg != srcEnd; ++srcBeg )
230  {
231  MapKeyType key(*srcBeg);
232  typename MapType::const_iterator i = searchMap.find( key );
233  if ( i != searchMap.end() )
234  {
235  destBeg = std::copy( i->second.begin(), i->second.end(), destBeg );
236  }
237  else
238  {
239  *destBeg++ = *srcBeg;
240  }
241  }
242  return destBeg;
243  }
244 
245 
259  template <typename InputIt, typename OutputIt, typename CharType, typename EscapeType>
260  OutputIt Unescape( InputIt srcBeg, InputIt srcEnd, OutputIt destBeg,
261  EscapeTable<CharType, EscapeType> const& mappingTable, bool16* pFoundInvalidEscape = nil )
262  {
263  typedef typename EscapeTable<CharType, EscapeType>::UnescapeMap MapType;
264  typedef typename EscapeTable<CharType, EscapeType>::UnescapeMapKey MapKeyType;
265  typedef typename EscapeTable<CharType, EscapeType>::UnescapeMapValue MapValueType;
266 
267  if ( pFoundInvalidEscape )
268  {
269  *pFoundInvalidEscape = kFalse;
270  }
271 
272  MapType const& searchMap = mappingTable.GetUnescapeMap();
273  ASSERT( !searchMap.empty() );
274 
275  MapKeyType const& escapeString = searchMap.begin()->first;
276 
277  typename MapKeyType::value_type escapeChar = escapeString[0];
278  size_t escapeLen = escapeString.size();
279 
280  while ( srcBeg != srcEnd )
281  {
282  // Is it an escape char?
283  if ( *srcBeg == escapeChar )
284  {
285  // Do we have enough chars left to be considered an escape sequence?
286  if ( escapeLen <= (size_t)std::distance(srcBeg, srcEnd) )
287  {
288  //MapKeyType key(srcBeg, srcBeg + escapeLen);
289  MapKeyType key;
290  std::copy(srcBeg, srcBeg + (int32)escapeLen, std::back_inserter(key) );
291  typename MapType::const_iterator i = searchMap.find( key );
292  if ( i != searchMap.end() )
293  {
294  // Replace with the found character
295  *destBeg++ = i->second;
296  std::advance(srcBeg, escapeLen);
297  // Continue with next char
298  continue;
299  }
300  }
301 
302  // We either found an invalid combination or a truncated escape sequence
303  // Signal that to the caller if it cares
304  if ( pFoundInvalidEscape )
305  {
306  *pFoundInvalidEscape = kTrue;
307  }
308  }
309 
310  // Copy the char to the output unchanged
311  *destBeg++ = *srcBeg++;
312  }
313 
314  return destBeg;
315  }
316 
317  // Some predefined tables; these are implemented in stringutils.cpp
318 
319  // Table for escaping control characters ( '\r', '\n', etc)
320  // This table should be used for finding control characters and display them to the UI.
321  extern EscapeTable<uint32, char> const kSearchableCtrlCharsTable;
322 
323  // This table should be used for replacing control characters.
324  // It contains only control chars that are safe to replace in the actual document.
325  extern EscapeTable<uint32, char> const kReplaceableCtrlCharsTable;
326 
327  // This table should be used for replacing control characters that do not include meta meaning.
328  // It contains only control chars that are safe to replace in the actual document and excludes
329  // characters such as page number.
330  extern EscapeTable<uint32, char> const kReplaceableNoMetaCtrlCharsTable;
331 
332  extern EscapeTable<uint32, char> const kSearchableGrepCtrlCharsTable;
333  extern EscapeTable<uint32, char> const kReplaceableGrepCtrlCharsTable;
334 
335  extern EscapeTable<uint32, char> const kReplaceableXRefFormatCtrlCharsTable;
336 
337  // This table is used for replacing those special chars to HTML version.
338  // extern EscapeTable<uint32, char> const kReplaceableHTMLCharsTable;
339 
340  // Predefined XML entities that should be recognized by any XML parser
341  extern EscapeTable<uint32, char> const kXMLPredefinedEntitiesCharsTable;
342 
343  // Wraps together the C-array and it's size
344  typedef K2Pair<textchar const*, size_t> CharArray;
345 
346  // Filter for commonly ignored chars.
347  extern CharArray const kIgnoredCharsFilterArray;
348 
349  // Filter for special control chars
350  extern CharArray const kSpecialControlCharsFilterArray;
351 
352  // Filter for whitespace (and similar) chars.
353  extern CharArray const kWhitespaceCharsFilterArray;
354 
355  // Filter for special/variable width whitespace (e.g. CR/LF, tabs, indent to here, etc) chars.
356  extern CharArray const kSpecialWhitespaceCharsFilterArray;
357 
358  // Filter for proxy chars (e.g. page numbers, footnotes, inlines, etc).
359  extern CharArray const kProxyCharsFilterArray;
360 
361  // Filter for illegal chars within calculated text (e.g. proxyChars + specialWhitespace + tables-related chars).
362  extern CharArray const kCalcTextIllegalCharsFilterArray;
363 
// Selects which end(s) of a string StripBeginEndChars strips.
enum StripType
{
    // Strip only at the beginning of the string.
    kStripLeadingChars = 0,
    // Strip only at the end of the string.
    kStripTrailingChars = 1,
    // Strip at both the beginning and the end of the string.
    kStripLeadingAndTrailingChars = 2,
};
373 
380  void StripBeginEndChars( WideString& text, CharArray const& charArray, ::StringUtils::StripType stripType);
381 
388  template<class PR> void StripBeginEndChars( WideString& text, PR filter, ::StringUtils::StripType stripType)
389  {
390  // remove leading characters
391  if (stripType == ::StringUtils::kStripLeadingAndTrailingChars || stripType == ::StringUtils::kStripLeadingChars)
392  {
393  WideString::const_iterator newBegin = text.begin();
394  for (; newBegin != text.end(); ++newBegin)
395  {
396  if (!filter(*newBegin))
397  break;
398  }
399  text.erase(text.begin(), newBegin);
400  }
401 
402  // remove trailing characters
403  if (stripType == ::StringUtils::kStripLeadingAndTrailingChars || stripType == ::StringUtils::kStripTrailingChars)
404  {
405  WideString::const_reverse_iterator rNewBegin = text.rbegin();
406  for (; rNewBegin != text.rend(); ++rNewBegin)
407  {
408  const UTF32TextChar c = *rNewBegin;
409  if (!filter(c))
410  break;
411  }
412 
413  text.erase(rNewBegin.base(), text.end());
414  }
415  }
416 
423  void FilterString( WideString& text, CharArray const& charArray);
424 
432  template<class PR> void FilterString( WideString& text, PR filter)
433  {
434  WideString::iterator_raw newEnd = std::remove_if(text.begin_raw(), text.end_raw(), filter);
435  if ( newEnd != text.end_raw() )
436  {
437  // text.erase(newEnd, text.end());
438  int32 pos = newEnd - text.begin_raw();
439  int32 len = text.end_raw() - newEnd;
440  text.remove_raw(pos, len);
441  }
442  }
443 
454  WideString ConvertFromUIString(const PMString& ps, EscapeTable<uint32, char> const *escapeTable, const CharArray* stripArray);
455 
463  PMString ConvertToUIString(const WideString& ws, EscapeTable<uint32, char> const *escapeTable);
464 
473  UTF16TextChar* ConvertInt32ToUTF16(int32 i, UTF16TextChar* wBuffBegin, UTF16TextChar* wBuffEnd);
474 
476  // used by PMString::AppendNumber and PMString::AsNumber.
477 
478  // @param i the number to Convert
479  // @param wBuffBegin the start of the output range
480  // @param wBuffEnd the end of the output range
481  // @return pointer to trailing null UTF16TextChar* or wBuffBegin if (wBuffEnd - wBuffBegin) is too small
482  // */
483  UTF16TextChar* ConvertInt64ToUTF16(int64 i, UTF16TextChar* wBuffBegin, UTF16TextChar* wBuffEnd);
484 
493  UTF16TextChar* ConvertUInt32ToUTF16(uint32 i, UTF16TextChar* wBuffBegin, UTF16TextChar* wBuffEnd);
494 
503  UTF16TextChar* ConvertUInt64ToUTF16(uint64 i, UTF16TextChar* wBuffBegin, UTF16TextChar* wBuffEnd);
504 
510  std::string ReadStdString(IPMStream *s);
511 
517  void WriteStdString(IPMStream *s, std::string writeString);
518 
527  void TokenizeByWhitespace(const WideString& ws, std::vector<WideString>& tokens);
528 
533  void NormalizeDigits(PMString *stringToNormalize);
534 
540  PMString NormalizeUnicodeString(const PMString &instring);
541 
542 }
543 
544  void ReplaceStringParameters(PMString *aString,const PMString& parm1,const PMString& parm2 = kNullString,
545  const PMString& parm3 = kNullString,const PMString& parm4 = kNullString,const PMString& parm5 = kNullString,
546  const PMString& parm6 = kNullString,const PMString& parm7 = kNullString);
547 
548 
549 
550 
558 #endif