// Copyright (c) 2005, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author: Sanjay Ghemawat
//
// Regular-expression based scanner for parsing an input stream.
//
// Example 1: parse a sequence of "var = number" entries from input:
//
//      Scanner scanner(input);
//      string var;
//      int number;
//      scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter
//      while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) {
//        ...;
//      }
| 43 | |
| 44 | #ifndef _PCRE_SCANNER_H |
| 45 | #define _PCRE_SCANNER_H |
| 46 | |
| 47 | #include <assert.h> |
| 48 | #include <string> |
| 49 | #include <vector> |
| 50 | |
| 51 | #include <pcrecpp.h> |
| 52 | #include <pcre_stringpiece.h> |
| 53 | |
| 54 | namespace pcrecpp { |
| 55 | |
| 56 | class PCRECPP_EXP_DEFN Scanner { |
| 57 | public: |
| 58 | Scanner(); |
| 59 | explicit Scanner(const std::string& input); |
| 60 | ~Scanner(); |
| 61 | |
| 62 | // Return current line number. The returned line-number is |
| 63 | // one-based. I.e. it returns 1 + the number of consumed newlines. |
| 64 | // |
| 65 | // Note: this method may be slow. It may take time proportional to |
| 66 | // the size of the input. |
| 67 | int LineNumber() const; |
| 68 | |
| 69 | // Return the byte-offset that the scanner is looking in the |
| 70 | // input data; |
| 71 | int Offset() const; |
| 72 | |
| 73 | // Return true iff the start of the remaining input matches "re" |
| 74 | bool LookingAt(const RE& re) const; |
| 75 | |
| 76 | // Return true iff all of the following are true |
| 77 | // a. the start of the remaining input matches "re", |
| 78 | // b. if any arguments are supplied, matched sub-patterns can be |
| 79 | // parsed and stored into the arguments. |
| 80 | // If it returns true, it skips over the matched input and any |
| 81 | // following input that matches the "skip" regular expression. |
| 82 | bool Consume(const RE& re, |
| 83 | const Arg& arg0 = RE::no_arg, |
| 84 | const Arg& arg1 = RE::no_arg, |
| 85 | const Arg& arg2 = RE::no_arg |
| 86 | // TODO: Allow more arguments? |
| 87 | ); |
| 88 | |
| 89 | // Set the "skip" regular expression. If after consuming some data, |
| 90 | // a prefix of the input matches this RE, it is automatically |
| 91 | // skipped. For example, a programming language scanner would use |
| 92 | // a skip RE that matches white space and comments. |
| 93 | // |
| 94 | // scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/"); |
| 95 | // |
| 96 | // Skipping repeats as long as it succeeds. We used to let people do |
| 97 | // this by writing "(...)*" in the regular expression, but that added |
| 98 | // up to lots of recursive calls within the pcre library, so now we |
| 99 | // control repetition explicitly via the function call API. |
| 100 | // |
| 101 | // You can pass NULL for "re" if you do not want any data to be skipped. |
| 102 | void Skip(const char* re); // DEPRECATED; does *not* repeat |
| 103 | void SetSkipExpression(const char* re); |
| 104 | |
| 105 | // Temporarily pause "skip"ing. This |
| 106 | // Skip("Foo"); code ; DisableSkip(); code; EnableSkip() |
| 107 | // is similar to |
| 108 | // Skip("Foo"); code ; Skip(NULL); code ; Skip("Foo"); |
| 109 | // but avoids creating/deleting new RE objects. |
| 110 | void DisableSkip(); |
| 111 | |
| 112 | // Reenable previously paused skipping. Any prefix of the input |
| 113 | // that matches the skip pattern is immediately dropped. |
| 114 | void EnableSkip(); |
| 115 | |
| 116 | /***** Special wrappers around SetSkip() for some common idioms *****/ |
| 117 | |
| 118 | // Arranges to skip whitespace, C comments, C++ comments. |
| 119 | // The overall RE is a disjunction of the following REs: |
| 120 | // \\s whitespace |
| 121 | // //.*\n C++ comment |
| 122 | // /[*](.|\n)*?[*]/ C comment (x*? means minimal repetitions of x) |
| 123 | // We get repetition via the semantics of SetSkipExpression, not by using * |
| 124 | void SkipCXXComments() { |
| 125 | SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/"); |
| 126 | } |
| 127 | |
| 128 | void set_save_comments(bool comments) { |
| 129 | save_comments_ = comments; |
| 130 | } |
| 131 | |
| 132 | bool save_comments() { |
| 133 | return save_comments_; |
| 134 | } |
| 135 | |
| 136 | // Append to vector ranges the comments found in the |
| 137 | // byte range [start,end] (inclusive) of the input data. |
| 138 | // Only comments that were extracted entirely within that |
| 139 | // range are returned: no range splitting of atomically-extracted |
| 140 | // comments is performed. |
| 141 | void GetComments(int start, int end, std::vector<StringPiece> *ranges); |
| 142 | |
| 143 | // Append to vector ranges the comments added |
| 144 | // since the last time this was called. This |
| 145 | // functionality is provided for efficiency when |
| 146 | // interleaving scanning with parsing. |
| 147 | void GetNextComments(std::vector<StringPiece> *ranges); |
| 148 | |
| 149 | private: |
| 150 | std::string data_; // All the input data |
| 151 | StringPiece input_; // Unprocessed input |
| 152 | RE* skip_; // If non-NULL, RE for skipping input |
| 153 | bool should_skip_; // If true, use skip_ |
| 154 | bool skip_repeat_; // If true, repeat skip_ as long as it works |
| 155 | bool save_comments_; // If true, aggregate the skip expression |
| 156 | |
| 157 | // the skipped comments |
| 158 | // TODO: later consider requiring that the StringPieces be added |
| 159 | // in order by their start position |
| 160 | std::vector<StringPiece> *comments_; |
| 161 | |
| 162 | // the offset into comments_ that has been returned by GetNextComments |
| 163 | int comments_offset_; |
| 164 | |
| 165 | // helper function to consume *skip_ and honour |
| 166 | // save_comments_ |
| 167 | void ConsumeSkip(); |
| 168 | }; |
| 169 | |
| 170 | } // namespace pcrecpp |
| 171 | |
| 172 | #endif /* _PCRE_SCANNER_H */ |