blob: 0f076f89087d86ff0afb4fd48ad1cf0654adaee4 [file] [log] [blame]
Alexandre Lisionddd731e2014-01-31 11:50:08 -05001// Copyright (C) 1999-2005 Open Source Telecom Corporation.
2// Copyright (C) 2006-2010 David Sugar, Tycho Softworks.
3//
4// This program is free software; you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation; either version 2 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program; if not, write to the Free Software
16// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17//
18// As a special exception, you may use this file as part of a free software
19// library without restriction. Specifically, if other files instantiate
20// templates or use macros or inline functions from this file, or you compile
21// this file and link it with other files to produce an executable, this
22// file does not by itself cause the resulting executable to be covered by
23// the GNU General Public License. This exception does not however
24// invalidate any other reasons why the executable file might be covered by
25// the GNU General Public License.
26//
27// This exception applies only to the code released under the name GNU
28// Common C++. If you copy code from other releases into a copy of GNU
29// Common C++, as the General Public License permits, the exception does
30// not apply to the code that you add in this way. To avoid misleading
31// anyone as to the status of such modified files, you must delete
32// this exception notice from them.
33//
34// If you write modifications of your own for GNU Common C++, it is your choice
35// whether to permit this exception to apply to your modifications.
36// If you do not wish that, delete this exception notice.
37//
38
39/**
40 * @file tokenizer.h
41 * @short string tokenizer.
42 **/
43
44#ifndef COMMONCPP_TOKENIZER_H_
45#define COMMONCPP_TOKENIZER_H_
46
47#ifndef COMMONCPP_CONFIG_H_
48#include <commoncpp/config.h>
49#endif
50
51#ifndef COMMONCPP_THREAD_H_
52#include <commoncpp/thread.h>
53#endif
54
55#ifndef COMMMONCPP_EXCEPTION_H_
56#include <commoncpp/exception.h>
57#endif
58
59NAMESPACE_COMMONCPP
60
61/**
62 * Splits delimited string into tokens.
63 *
64 * The StringTokenizer takes a pointer to a string and a pointer
65 * to a string containing a number of possible delimiters.
66 * The StringTokenizer provides an input forward iterator which allows
67 * to iterate through all tokens. An iterator behaves like a logical
68 * pointer to the tokens, i.e. to shift to the next token, you've
69 * to increment the iterator, you get the token by dereferencing the
70 * iterator.
71 *
72 * Memory consumption:
73 * This class operates on the original string and only allocates memory
74 * for the individual tokens actually requested, so this class
75 * allocates at maximum the space required for the longest token in the
76 * given string.
77 * Since for each iteration, memory is reclaimed for the last token,
78 * you MAY NOT store pointers to them; if you need them afterwards,
79 * copy them. You may not modify the original string while you operate
80 * on it with the StringTokenizer; the behaviour is undefined in that
81 * case.
82 *
83 * The iterator has one special method 'nextDelimiter()' which returns
84 * a character containing the next delimiter following this
85 * tokenization process or '\\0', if there are no following delimiters. In
86 * case of skipAllDelim, it returns the FIRST delimiter.
87 *
88 * With the method 'setDelimiters(const char*)' you may change the
89 * set of delimiters. It affects all running iterators.
90 *
91 * Example:
92 * <code><pre>
93 * StringTokenizer st("mary had a little lamb;its fleece was..", " ;");
94 * StringTokenizer::iterator i;
95 * for (i = st.begin() ; i != st.end() ; ++i) {
96 * cout << "Token: '" << *i << "'\t";
97 * cout << " next Delim: '" << i.nextDelimiter() << "'" << endl;
98 * }
99 * </pre></code>
100 *
101 * @author Henner Zeller <H.Zeller@acm.org>
102 * @license LGPL
103 */
104class __EXPORT StringTokenizer {
105public:
106 /**
107 * a delimiter string containing all usual whitespace delimiters.
108 * These are space, tab, newline, carriage return,
109 * formfeed and vertical tab. (see isspace() manpage).
110 */
111 static const char * const SPACE;
112
113 /**
114 * Exception thrown, if someone tried to read beyond the
115 * end of the tokens.
116 * Will not happen if you use it the 'clean' way with comparison
117 * against end(), but if you skip some tokens, because you 'know'
118 * they are there. Simplifies error handling a lot, since you can
119 * just read your tokens the way you expect it, and if there is some
120 * error in the input this Exception will be thrown.
121 */
122 // maybe move more global ?
123 class NoSuchElementException { };
124
125 /**
126 * The input forward iterator for tokens.
127 * @author Henner Zeller
128 */
129 class __EXPORT iterator {
130 friend class StringTokenizer; // access our private constructors
131 private:
132 const StringTokenizer *myTok; // my StringTokenizer
133 const char *start; // start of current token
134 const char *tokEnd; // end of current token (->nxDelimiter)
135 const char *endp; // one before next token
136 char *token; // allocated token, if requested
137
138 // for initialization of the itEnd iterator
139 iterator(const StringTokenizer &tok, const char *end)
140 : myTok(&tok),tokEnd(0),endp(end),token(0) {}
141
142 iterator(const StringTokenizer &tok)
143 : myTok(&tok),tokEnd(0),endp(myTok->str-1),token(0) {
144 ++(*this); // init first token.
145 }
146
147 public:
148 iterator() : myTok(0),start(0),tokEnd(0),endp(0),token(0) {}
149
150 // see also: comment in implementation of operator++
151 virtual ~iterator()
152 { if (token) *token='\0'; delete [] token; }
153
154 /**
155 * copy constructor.
156 */
157 // everything, but not responsible for the allocated token.
158 iterator(const iterator& i) :
159 myTok(i.myTok),start(i.start),tokEnd(i.tokEnd),
160 endp(i.endp),token(0) {}
161
162 /**
163 * assignment operator.
164 */
165 // everything, but not responsible for the allocated token.
166 iterator &operator = (const iterator &i)
167 {
168 myTok = i.myTok;
169 start = i.start; endp = i.endp; tokEnd = i.tokEnd;
170 if ( token )
171 delete [] token;
172 token = 0;
173 return *this;
174 }
175
176 /**
177 * shifts this iterator to the next token in the string.
178 */
179 iterator &operator ++ () THROWS (NoSuchElementException);
180
181 /**
182 * returns the immutable string this iterator
183 * points to or '0' if no token is available (i.e.
184 * i == end()).
185 * Do not store pointers to this token, since it is
186 * invalidated for each iteration. If you need the token,
187 * copy it (e.g. with strdup());
188 */
189 const char* operator * () THROWS (NoSuchElementException);
190
191 /**
192 * returns the next delimiter after the current token or
193 * '\\0', if there are no following delimiters.
194 * It returns the very next delimiter (even if
195 * skipAllDelim=true).
196 */
197 inline char nextDelimiter() const
198 {return (tokEnd) ? *tokEnd : '\0';}
199
200 /**
201 * compares to other iterator. Usually used to
202 * compare against the end() iterator.
203 */
204 // only compare the end-position. speed.
205 inline bool operator == (const iterator &other) const
206 {return (endp == other.endp);}
207
208 /**
209 * compares to other iterator. Usually used to
210 * compare against the end() iterator.
211 */
212 // only compare the end position. speed.
213 inline bool operator != (const iterator &other) const
214 {return (endp != other.endp);}
215 };
216private:
217 friend class StringTokenizer::iterator;
218 const char *str;
219 const char *delim;
220 bool skipAll, trim;
221 iterator itEnd;
222
223public:
224 /**
225 * creates a new StringTokenizer for a string
226 * and a given set of delimiters.
227 *
228 * @param str String to be split up. This string will
229 * not be modified by this StringTokenizer,
230 * but you may as well not modfiy this string
231 * while tokenizing is in process, which may
232 * lead to undefined behaviour.
233 *
234 * @param delim String containing the characters
235 * which should be regarded as delimiters.
236 *
237 * @param skipAllDelim OPTIONAL.
238 * true, if subsequent
239 * delimiters should be skipped at once
240 * or false, if empty tokens should
241 * be returned for two delimiters with
242 * no other text inbetween. The first
243 * behaviour may be desirable for whitespace
244 * skipping, the second for input with
245 * delimited entry e.g. /etc/passwd like files
246 * or CSV input.
247 * NOTE, that 'true' here resembles the
248 * ANSI-C strtok(char *s,char *d) behaviour.
249 * DEFAULT = false
250 *
251 * @param trim OPTIONAL.
252 * true, if the tokens returned
253 * should be trimmed, so that they don't have
254 * any whitespaces at the beginning or end.
255 * Whitespaces are any of the characters
256 * defined in StringTokenizer::SPACE.
257 * If delim itself is StringTokenizer::SPACE,
258 * this will result in a behaviour with
259 * skipAllDelim = true.
260 * DEFAULT = false
261 */
262 StringTokenizer (const char *str,
263 const char *delim,
264 bool skipAllDelim = false,
265 bool trim = false);
266
267 /**
268 * create a new StringTokenizer which splits the input
269 * string at whitespaces. The tokens are stripped from
270 * whitespaces. This means, if you change the set of
271 * delimiters in either the 'begin(const char *delim)' method
272 * or in 'setDelimiters()', you then get whitespace
273 * trimmed tokens, delimited by the new set.
274 * Behaves like StringTokenizer(s, StringTokenizer::SPACE,false,true);
275 */
276 StringTokenizer (const char *s);
277
278 /**
279 * returns the begin iterator
280 */
281 iterator begin() const
282 {return iterator(*this);}
283
284 /**
285 * changes the set of delimiters used in subsequent
286 * iterations.
287 */
288 void setDelimiters (const char *d)
289 {delim = d;}
290
291 /**
292 * returns a begin iterator with an alternate set of
293 * delimiters.
294 */
295 iterator begin(const char *d)
296 {
297 delim = d;
298 return iterator(*this);
299 }
300
301 /**
302 * the iterator marking the end.
303 */
304 const iterator& end() const
305 {return itEnd;}
306};
307
308END_NAMESPACE
309
310#endif
311
312/** EMACS **
313 * Local variables:
314 * mode: c++
315 * c-basic-offset: 4
316 * End:
317 */