View Javadoc

1   /**
2    * Copyright (c) 2011, University of Konstanz, Distributed Systems Group
3    * All rights reserved.
4    * 
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions are met:
7    * * Redistributions of source code must retain the above copyright
8    * notice, this list of conditions and the following disclaimer.
9    * * Redistributions in binary form must reproduce the above copyright
10   * notice, this list of conditions and the following disclaimer in the
11   * documentation and/or other materials provided with the distribution.
12   * * Neither the name of the University of Konstanz nor the
13   * names of its contributors may be used to endorse or promote products
14   * derived from this software without specific prior written permission.
15   * 
16   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19   * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
20   * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26   */
27  
28  package org.treetank.service.xml.xpath.parser;
29  
30  /**
31   * <h1>XPathScanner</h1>
32   * <p>
33   * Lexical scanner to extract tokens from the query.
34   * </p>
35   * <p>
36   * This scanner is used to interpret the query. It reads the the query string char by char and specifies the
37   * type of the input and creates a token for every logic text unit.
38   * </p>
39   */
40  public final class XPathScanner {
41  
42      /** The XPath query to scan. */
43      private final String mQuery;
44  
45      /** The current position of the cursor to the query string. */
46      private int mPos;
47  
48      /** Start position of the last item. */
49      private int mLastPos;
50  
51      /** Scanner states. */
52      private enum State {
53          /** Start state. */
54          START,
55          /** Number state. */
56          NUMBER,
57          /** Text state. */
58          TEXT,
59          /** Special state. */
60          SPECIAL,
61          /** Special state with 2 caracters. */
62          SPECIAL2,
63          /** Comment state. */
64          COMMENT,
65          /** Enum state. */
66          E_NUM,
67          /** Unknown state. */
68          UNKNOWN
69      }
70  
71      /** The state the scanner is currently in. */
72      private State mState;
73  
74      /** Contains the current content of the token. */
75      private StringBuilder mOutput;
76  
77      /**
78       * Defines if all digits of a token have been read or if the token still can
79       * have more digits.
80       */
81      private boolean mFinnished;
82  
83      /** The type of the current token. */
84      private TokenType mType;
85  
86      /** The current character. */
87      private char mInput;
88  
89      /**
90       * State with which the next token starts. Sometimes it is not needed to
91       * start in the start state for some tokens, as their type is known because
92       * of the preceding token.
93       */
94      private State mStartState;
95  
96      /**
97       * Counts the number of nested comments. Is needed to distinguish whether
98       * the current token is part of a comment, or part of the query. If
99       * mCommentCount > 0, the current token is part of a comment.
100      */
101     private int mCommentCount;
102 
103     /**
104      * Constructor. Initializes the internal state. Receives query and adds a
105      * end mark to it.
106      * 
107      * @param mQuery
108      *            the query to scan
109      */
110     public XPathScanner(final String mQuery) {
111 
112         this.mQuery = mQuery + '#'; // end mark to recognize the end
113         mPos = 0;
114         mLastPos = mPos;
115         mStartState = State.START;
116         mCommentCount = 0;
117     }
118 
119     /**
120      * Reads the string char by char and returns one token by call. The scanning
121      * starts in the start state, if not further specified before, and specifies
122      * the next scanner state and the type of the future token according to its
123      * first char. As soon as the current char does not fit the conditions for
124      * the current token type, the token is generated and returned.
125      * 
126      * @return token The new token.
127      */
128     public IXPathToken nextToken() {
129 
130         // some tokens start in another state than the START state
131         mState = mStartState;
132         // reset startState
133         mStartState = State.START;
134         mOutput = new StringBuilder();
135         mFinnished = false;
136         mType = TokenType.INVALID;
137         mLastPos = mPos;
138 
139         do {
140             mInput = mQuery.charAt(mPos);
141 
142             switch (mState) {
143             case START: // specify token type according to first char
144                 scanStart();
145                 break;
146             case NUMBER: // number
147                 scanNumber();
148                 break;
149             case TEXT: // some text, could be a name
150                 scanText();
151                 break;
152             case SPECIAL2: // special character that could have 2 digits
153                 scanTwoDigitSpecial();
154                 break;
155             case COMMENT:
156                 scanComment();
157                 break;
158             case E_NUM:
159                 scanENum();
160                 break;
161             default:
162                 mPos++;
163                 mFinnished = true;
164             }
165         } while (!mFinnished || mPos >= mQuery.length());
166 
167         if (mCommentCount > 0) {
168             throw new IllegalStateException("Error in Query. Comment does not end.");
169         }
170 
171         return new VariableXPathToken(mOutput.toString(), mType);
172     }
173 
174     /**
175      * Scans the first character of a token and decides, what type it is.
176      */
177     private void scanStart() {
178 
179         if (isNumber(mInput)) {
180             mState = State.NUMBER;
181             mOutput.append(mInput);
182             mType = TokenType.VALUE; // number
183         } else if (isFirstLetter(mInput)) {
184             mState = State.TEXT; // word
185             mOutput.append(mInput);
186             mType = TokenType.TEXT;
187         } else if (isSpecial(mInput)) {
188             mState = State.SPECIAL; // special character with only one digit
189             mOutput.append(mInput);
190             mType = retrieveType(mInput);
191             mFinnished = true;
192         } else if (isTwoDigistSpecial(mInput)) {
193             mState = State.SPECIAL2; // 2 digit special character
194             mOutput.append(mInput);
195             mType = retrieveType(mInput);
196         } else if ((mInput == ' ') || (mInput == '\n')) {
197             mState = State.START;
198             mOutput.append(mInput);
199             mFinnished = true;
200             mType = TokenType.SPACE;
201         } else if (mInput == '#') {
202             mType = TokenType.END; // end of query
203             mFinnished = true;
204             mPos--;
205         } else {
206             mState = State.UNKNOWN; // unknown character
207             mOutput.append(mInput);
208             mFinnished = true;
209         }
210         mPos++;
211     }
212 
213     /**
214      * Returns the type of the given character.
215      * 
216      * @param paramInput
217      *            The character the type should be determined
218      * @return type of the given character.
219      */
220     private TokenType retrieveType(final char paramInput) {
221 
222         TokenType type;
223         switch (paramInput) {
224         case ',':
225             type = TokenType.COMMA;
226             break;
227         case '(':
228             type = TokenType.OPEN_BR;
229             break;
230         case ')':
231             type = TokenType.CLOSE_BR;
232             break;
233         case '[':
234             type = TokenType.OPEN_SQP;
235             break;
236         case ']':
237             type = TokenType.CLOSE_SQP;
238             break;
239         case '@':
240             type = TokenType.AT;
241             break;
242         case '=':
243             type = TokenType.EQ;
244             break;
245         case '<':
246         case '>':
247             type = TokenType.COMP;
248             break;
249         case '!':
250             type = TokenType.N_EQ;
251             break;
252         case '/':
253             type = TokenType.SLASH;
254             break;
255         case ':':
256             type = TokenType.COLON;
257             break;
258         case '.':
259             type = TokenType.POINT;
260             break;
261         case '+':
262             type = TokenType.PLUS;
263             break;
264         case '-':
265             type = TokenType.MINUS;
266             break;
267         case '\'':
268             type = TokenType.SINGLE_QUOTE;
269             break;
270         case '"':
271             type = TokenType.DBL_QUOTE;
272             break;
273         case '$':
274             type = TokenType.DOLLAR;
275             break;
276         case '?':
277             type = TokenType.INTERROGATION;
278             break;
279         case '*':
280             type = TokenType.STAR;
281             break;
282         case '|':
283             type = TokenType.OR;
284             break;
285         default:
286             type = TokenType.INVALID;
287         }
288         return type;
289 
290     }
291 
292     /**
293      * Checks if the given character is a valid first letter.
294      * 
295      * @param paramInput
296      *            The character to check.
297      * @return Returns true, if the character is a first letter.
298      */
299     private boolean isFirstLetter(final char paramInput) {
300 
301         return (paramInput >= 'a' && paramInput <= 'z') || (paramInput >= 'A' && paramInput <= 'Z')
302             || (paramInput == '_');
303     }
304 
305     /**
306      * Checks if the given character is a number.
307      * 
308      * @param paramInput
309      *            The character to check.
310      * @return Returns true, if the character is a number.
311      */
312     private boolean isNumber(final char paramInput) {
313 
314         return paramInput >= '0' && paramInput <= '9';
315     }
316 
317     /**
318      * Checks if the given character is a special character that can have 2
319      * digits.
320      * 
321      * @param paramInput
322      *            The character to check.
323      * @return Returns true, if the character is a special character that can
324      *         have 2 digits.
325      */
326     private boolean isTwoDigistSpecial(final char paramInput) {
327 
328         return (paramInput == '<') || (paramInput == '>') || (paramInput == '(') || (paramInput == '!')
329             || (paramInput == '/') || (paramInput == '.');
330     }
331 
332     /**
333      * Checks if the given character is a special character.
334      * 
335      * @param paramInput
336      *            The character to check.
337      * @return Returns true, if the character is a special character.
338      */
339     private boolean isSpecial(final char paramInput) {
340 
341         return (paramInput == ')') || (paramInput == ';') || (paramInput == ',') || (paramInput == '@')
342             || (paramInput == '[') || (paramInput == ']') || (paramInput == '=') || (paramInput == '"')
343             || (paramInput == '\'') || (paramInput == '$') || (paramInput == ':') || (paramInput == '|')
344             || (paramInput == '+') || (paramInput == '-') || (paramInput == '?') || (paramInput == '*');
345     }
346 
347     /**
348      * Scans a number token. A number only consists of digits.
349      */
350     private void scanNumber() {
351 
352         if (mInput >= '0' && mInput <= '9') {
353             mOutput.append(mInput);
354             mPos++;
355         } else {
356             // could be an e-number
357             if (mInput == 'E' || mInput == 'e') {
358                 mStartState = State.E_NUM;
359             }
360             mFinnished = true;
361         }
362     }
363 
364     /**
365      * Scans text token. A text is everything that with a character. It can
366      * contain numbers, all letters in upper or lower case and underscores.
367      */
368     private void scanText() {
369 
370         if (isLetter(mInput)) {
371             mOutput.append(mInput);
372             mPos++;
373 
374         } else {
375             mType = TokenType.TEXT;
376             mFinnished = true;
377         }
378     }
379 
380     /**
381      * Scans special characters that can have more then one digit. E.g. ==, !=,
382      * <=, >=, //, .., (:
383      */
384     private void scanTwoDigitSpecial() {
385 
386         if (mInput == '=' && (mType == TokenType.COMP || mType == TokenType.EQ || mType == TokenType.N_EQ)) {
387             mOutput.append(mInput);
388             mPos++;
389         } else if (mInput == '/' && (mType == TokenType.SLASH)) {
390             mOutput.append(mInput);
391             mType = TokenType.DESC_STEP;
392             mPos++;
393         } else if (mInput == '.' && (mType == TokenType.POINT)) {
394             mOutput.append(mInput);
395             mType = TokenType.PARENT;
396             mPos++;
397         } else if (mInput == '<' && mOutput.toString().equals("<")) {
398             mOutput.append(mInput);
399             mType = TokenType.L_SHIFT;
400             mPos++;
401         } else if (mInput == '>' && mOutput.toString().equals(">")) {
402             mOutput.append(mInput);
403             mType = TokenType.R_SHIFT;
404             mPos++;
405         } else if (mInput == ':' && mType == TokenType.OPEN_BR) {
406             // could be start of a comment
407             mOutput = new StringBuilder();
408             mType = TokenType.COMMENT;
409             mCommentCount++;
410             mState = State.COMMENT;
411             mPos++;
412         } else {
413             mFinnished = true;
414         }
415     }
416 
417     /**
418      * Scans all numbers that contain an e.
419      */
420     private void scanENum() {
421 
422         if (mInput == 'E' || mInput == 'e') {
423             mOutput.append(mInput);
424             mState = State.START;
425             mType = TokenType.E_NUMBER;
426             mFinnished = true;
427             mPos++;
428         } else {
429             mFinnished = true;
430             mState = State.START;
431             mType = TokenType.INVALID;
432         }
433     }
434 
435     /**
436      * Scans comments.
437      */
438     private void scanComment() {
439         final char input = mQuery.charAt(mPos + 1);
440         if (mInput == ':') {
441             // check if is end of comment, indicated by ':)'
442             if (input == ')') {
443                 mCommentCount--;
444                 if (mCommentCount == 0) {
445                     mState = State.START;
446                     // increment position, because next digit has already been
447                     // processed
448                     mPos++;
449                 }
450 
451             }
452         } else if (mInput == '(') {
453             // check if start of new nested comment, indicated by '(:'
454             if (input == ':') {
455                 mCommentCount++;
456 
457             }
458         }
459         mPos++;
460     }
461 
462     /**
463      * Checks if the given character is a letter.
464      * 
465      * @param paramInput
466      *            The character to check.
467      * @return Returns true, if the character is a letter.
468      */
469     private boolean isLetter(final char paramInput) {
470 
471         return (paramInput >= '0' && paramInput <= '9') || (paramInput >= 'a' && paramInput <= 'z')
472             || (paramInput >= 'A' && paramInput <= 'Z') || (paramInput == '_') || (paramInput == '-')
473             || (paramInput == '.');
474 
475     }
476 
477     /**
478      * Return the token that will be returned by the scanner after the call of
479      * nextToken(), without changing the internal state of the scanner.
480      * 
481      * @param paramNext
482      *            number of next tokens to be read
483      * @return token that will be read after calling nextToken()
484      */
485     public IXPathToken lookUpTokens(final int paramNext) {
486 
487         int nextCount = paramNext;
488 
489         // save current position of the scanner, to restore it later
490         final int lastPos = mPos;
491         IXPathToken token = nextToken();
492 
493         while (--nextCount > 0) {
494             token = nextToken();
495             if (token.getType() == TokenType.SPACE) {
496                 nextCount++;
497             }
498         }
499 
500         // reset position
501         mPos = lastPos;
502         return token;
503     }
504 
505     /**
506      * Returns the beginning of a query that has already been scanned. This can
507      * be used by the client e.g. for error messages in case of unexpected token
508      * occurs.
509      * 
510      * @return string so far
511      */
512     public String begin() {
513 
514         return mQuery.substring(0, mLastPos);
515     }
516 
517     /**
518      * Return the current cursor position in the query.
519      * 
520      * @return current position of the cursor
521      */
522     public int getPos() {
523 
524         return mPos;
525     }
526 
527     /**
528      * {@inheritDoc}
529      */
530     public String toString() {
531         return mQuery;
532     }
533 }