xoreos  0.0.5
streamtokenizer.cpp
Go to the documentation of this file.
1 /* xoreos - A reimplementation of BioWare's Aurora engine
2  *
3  * xoreos is the legal property of its developers, whose names
4  * can be found in the AUTHORS file distributed with this source
5  * distribution.
6  *
7  * xoreos is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 3
10  * of the License, or (at your option) any later version.
11  *
12  * xoreos is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with xoreos. If not, see <http://www.gnu.org/licenses/>.
19  */
20 
25 #include <cassert>
26 
28 #include "src/common/readstream.h"
29 #include "src/common/error.h"
30 
31 namespace Common {
32 
33 StreamTokenizer::StreamTokenizer(ConsecutiveSeparatorRule conSepRule) : _conSepRule(conSepRule) {
34 }
35 
36 bool StreamTokenizer::isIn(uint32 c, const std::list<uint32> &list) {
37  for (std::list<uint32>::const_iterator it = list.begin(); it != list.end(); ++it)
38  if (*it == c)
39  return true;
40 
41  return false;
42 }
43 
45  assert(!isIn(c, _separators) && !isIn(c, _quotes) && !isIn(c, _chunkEnds) && !isIn(c, _ignores));
46 
47  _separators.push_back(c);
48 }
49 
51  assert(!isIn(c, _separators) && !isIn(c, _quotes) && !isIn(c, _chunkEnds) && !isIn(c, _ignores));
52 
53  _quotes.push_back(c);
54 }
55 
57  assert(!isIn(c, _separators) && !isIn(c, _quotes) && !isIn(c, _chunkEnds) && !isIn(c, _ignores));
58 
59  _chunkEnds.push_back(c);
60 }
61 
63  assert(!isIn(c, _separators) && !isIn(c, _quotes) && !isIn(c, _chunkEnds) && !isIn(c, _ignores));
64 
65  _ignores.push_back(c);
66 }
67 
69  bool chunkEnd = false;
70  bool inQuote = false;
71  uint32 separator = 0xFFFFFFFF;
72 
73  uint32 c;
74  UString token;
75 
76  /* Run through the stream, character by character, checking their
77  * "character classes" and collecting characters for a token. */
78  while ((c = stream.readChar()) != ReadStream::kEOF) {
79  // Character classes
80  const bool isSeparatorChar = isIn(c, _separators);
81  const bool isQuoteChar = isIn(c, _quotes);
82  const bool isChunkEndChar = isIn(c, _chunkEnds);
83  const bool isIgnoreChar = isIn(c, _ignores);
84 
85  /* Handle ignored characters.
86  *
87  * All characters in the ignored characters list will be ignored
88  * completely. They will never be added to the token.
89  */
90  if (isIgnoreChar)
91  continue;
92 
93  /* Handle quote characters.
94  *
95  * A quote character toggles the "we're in quotes state". Any
96  * character that's found while in this state will be added to
97  * the token, even if it is a separator or chunk end character.
98  */
99  if (isQuoteChar) {
100  inQuote = !inQuote;
101  continue;
102  }
103 
104  if (inQuote) {
105  token += c;
106  continue;
107  }
108 
109  /* Handle chunk end characters.
110  *
111  * When we've reached the end of the chunk, seek back by one
112  * character, so that the stream is positioned right before
113  * the chunk end characters. Then break to stop collecting.
114  */
115  if (isChunkEndChar) {
117  chunkEnd = true;
118  break;
119  }
120 
121  /* Handle separator characters.
122  *
123  * When we've found a separator character, remember which it was
124  * (we will need it to check if we should skip following separators).
125  * Then break to stop collecting.
126  */
127  if (isSeparatorChar) {
128  separator = c;
129  break;
130  }
131 
132  /* At this point, we have a character that's not in any of the
133  * special character classes and is not in quotes. This is a normal
134  * character we'll just add to the token. Then we'll continue
135  * with the next character.
136  */
137 
138  token += c;
139  }
140 
141  /* Since we're technically operating on streams of arbitrary binary data,
142  * we might have collected \0 characters. Cut off the token at that point.
143  */
144  Common::UString::iterator nullChar = token.findFirst('\0');
145  if (nullChar != token.end())
146  token.truncate(nullChar);
147 
148  /* If we stopped collecting at a chunk end, there's nothing left to do.
149  * Just return the token.
150  */
151  if (chunkEnd)
152  return token;
153 
154  /* However, if we stopped collecting at a separator see if we should skip
155  * following consecutive separators.
156  *
157  * Depending on the value ConsecutiveSeparatorRule, there's different ways
158  * to go about this:
159  * - kRuleHeed: Never skip consecutive separators
160  * - kRuleIgnoreSame: Ignore all consecutive separators that are the same
161  * - kRuleIgnoreAll: Ignore all consecutive separators
162  *
163  * So we're going to consume characters out of the stream until either:
164  * - we've reached a character that is not a separator
165  * - the rule says we shouldn't skip this separator
166  *
167  * In either case, the stream is positioned right after the last separator
168  * that should be skipped.
169  */
170  if (_conSepRule != kRuleHeed) {
171  while ((c = stream.readChar()) != ReadStream::kEOF) {
172  const bool isSeparator = isIn(c, _separators);
173 
174  bool shouldSkip = isSeparator;
175  if ((_conSepRule == kRuleIgnoreSame) && (c != separator))
176  shouldSkip = false;
177 
178  if (!shouldSkip) {
180  break;
181  }
182  }
183  }
184 
185  // Finally, we can return the token
186  return token;
187 }
188 
189 size_t StreamTokenizer::getTokens(SeekableReadStream &stream, std::vector<UString> &list,
190  size_t min, size_t max, const UString &def) {
191 
192  assert(max >= min);
193 
194  list.clear();
195  list.reserve(min);
196 
197  size_t realTokenCount = 0;
198  while (!isChunkEnd(stream) && (realTokenCount < max)) {
199  UString token = getToken(stream);
200 
201  if (!token.empty() || (_conSepRule != kRuleIgnoreAll)) {
202  list.push_back(token);
203  realTokenCount++;
204  }
205  }
206 
207  while (list.size() < min)
208  list.push_back(def);
209 
210  return realTokenCount;
211 }
212 
214  uint32 c;
215  while ((c = stream.readChar()) != ReadStream::kEOF) {
216  if (!isIn(c, _separators) && !(isIn(c, _ignores))) {
218  break;
219  }
220  }
221 }
222 
224  while (n-- > 0)
225  UString token = getToken(stream);
226 }
227 
229  assert(!_chunkEnds.empty());
230 
231  uint32 c;
232  while ((c = stream.readChar()) != ReadStream::kEOF) {
233  if (isIn(c, _chunkEnds)) {
235  break;
236  }
237  }
238 }
239 
241  skipChunk(stream);
242 
243  uint32 c = stream.readChar();
244  if (c == ReadStream::kEOF)
245  return;
246 
247  if (!isIn(c, _chunkEnds))
249 }
250 
252  uint32 c = stream.readChar();
253  if (c == ReadStream::kEOF)
254  return true;
255 
256  bool chunkEnd = isIn(c, _chunkEnds);
257 
259 
260  return chunkEnd;
261 }
262 
263 } // End of namespace Common
Definition: 2dafile.h:39
A class holding a UTF-8 string.
Definition: ustring.h:48
virtual size_t seek(ptrdiff_t offset, Origin whence=kOriginBegin)=0
Sets the stream position indicator for the stream.
bool isChunkEnd(SeekableReadStream &stream)
UString getToken(SeekableReadStream &stream)
Parse a token out of the stream.
void truncate(const iterator &it)
Definition: ustring.cpp:343
void addChunkEnd(uint32 c)
Add a character marking the end of a chunk.
static const uint32 kEOF
Return value for end-of-file.
Definition: readstream.h:67
iterator findFirst(uint32 c) const
Definition: ustring.cpp:261
static bool isIn(uint32 c, const std::list< uint32 > &list)
Basic exceptions to throw.
utf8::iterator< std::string::const_iterator > iterator
Definition: ustring.h:50
bool empty() const
Is the string empty?
Definition: ustring.cpp:245
std::list< uint32 > _quotes
void addIgnore(uint32 c)
Add a character to ignore.
StreamTokenizer(ConsecutiveSeparatorRule conSepRule=kRuleHeed)
Basic reading stream interfaces.
std::list< uint32 > _chunkEnds
Seek from the current position of the stream.
Definition: readstream.h:270
void findFirstToken(SeekableReadStream &stream)
Find the first token character, skipping past separators.
size_t getTokens(SeekableReadStream &stream, std::vector< UString > &list, size_t min=0, size_t max=SIZE_MAX, const UString &def="")
Parse tokens out of the stream.
std::list< uint32 > _ignores
ConsecutiveSeparatorRule _conSepRule
uint32_t uint32
Definition: types.h:204
ConsecutiveSeparatorRule
What to do when consecutive separators are found.
Parse tokens out of a stream.
void nextChunk(SeekableReadStream &stream)
Skip past end of chunk characters.
Ignore all repeated separators.
uint32 readChar()
Reads the next character from stream and returns it as an unsigned char cast to an uint32...
Definition: readstream.h:108
void skipChunk(SeekableReadStream &stream)
Skip to the end of the chunk.
void addSeparator(uint32 c)
Add a character on where to split tokens.
iterator end() const
Definition: ustring.cpp:257
void skipToken(SeekableReadStream &stream, size_t n=1)
Skip a number of tokens.
std::list< uint32 > _separators
Ignore the repeated separator, but only if it's the same.
void addQuote(uint32 c)
Add a character able to enclose (quote) separators and chunk ends.
Interface for a seekable & readable data stream.
Definition: readstream.h:265