CTBxmlLexer.cxx Source File

00001 
00006 /*----------------------------------------------------------------------------*/
00007 /* C Tool Box: Designed and implemented by:                                   */
00008 /*    Walter F.J. Mueller   Gesellschaft fuer Schwerionenforschung (GSI)      */
00009 /*                          Planckstrasse 1, D-64291 Darmstadt, Germany       */
00010 /*                  Email:  W.F.J.Mueller@gsi.de                              */
00011 /*                  WWW:    http://www-kp3.gsi.de/www/kp3/people/mueller.html */
00012 /*----------------------------------------------------------------------------*/
00013 
00014 #include <ctype.h>
00015 
00016 #include "CTB.hxx"
00017 #include "CTB_Trace.hxx"
00018 #include "CTBosFill.hxx"
00019 #include "CTBexceptionBugcheck.hxx"
00020 
00021 #include "CTBxmlLexer.hxx"
00022 
00033 //------------------------------------------+-----------------------------------
00035 
00036 CTBxmlLexer::CTBxmlLexer()
00037   : mi_mode(0),
00038     mi_nline_t(1),
00039     mi_ncolumn_t(1),
00040     mi_nline_s(1),
00041     mi_ncolumn_s(1)
00042 {
00043   CTB_Trace("CTBxmlLexer()");
00044 }
00045 
00046 //------------------------------------------+-----------------------------------
00048 
00049 void CTBxmlLexer::Get(istream& is, CTBxmlLexerToken& lt)
00050 {
00051   CTB_Trace("CTBxmlLexer::Get(istream&, CTBxmlLexerToken&)");
00052   int i_c1;
00053   int i_quote;
00054   
00055   mi_nline_t   = mi_nline_s;
00056   mi_ncolumn_t = mi_ncolumn_s;
00057 
00058   lt.Clear();                               // set type to invalid ...
00059  
00060   i_c1 = GetChar(is);                       // get leading character
00061 
00062   if (i_c1 == EOF) {                        // this signals some error
00063     if (is.eof()) {                         // if at end-of-file
00064       lt.Type(CTBxmlLexerToken::eof);
00065     } else {                                // otherwise
00066       lt.Type(CTBxmlLexerToken::fail);
00067     }
00068     return;
00069   }
00070 
00071   lt += (char) i_c1;
00072   
00073   switch (mi_mode) {                        // handle different lexer modes
00074 
00075   case tagordata:
00076     if (i_c1 == '<') {                      // "<" -- any tag start ...
00077       int i_cp = is.peek();
00078 
00079       if (i_cp == '/') {                    // "</" -- empty tag open
00080         lt += (char) GetChar(is);
00081         lt.Type(CTBxmlLexerToken::etagopen);
00082         return;
00083       }
00084 
00085       if (i_cp == '?') {                    // "<?" -- pi open
00086         lt += (char) GetChar(is);
00087         lt.Type(CTBxmlLexerToken::piopen);
00088         return;
00089       }
00090     
00091       if (i_cp == '!') {                    // "<!" -- definition or comment
00092         lt += (char) GetChar(is);
00093         
00094         if (is.peek() == '-') {             // "<!-" -- begin of comment 
00095           lt += (char) GetChar(is);
00096           if (is.peek() == '-') {           // "<!--" -- comment start
00097             int i_cstate = 0;
00098             
00099             lt += (char) GetChar(is);
00100             
00101             for (;;) {                      // gobble up rest of comment
00102               int i_c2 = GetChar(is);
00103               
00104               if (i_c2 == EOF) {            // early end of comment
00105                 lt.Type(CTBxmlLexerToken::fail);
00106                 return;
00107               }
00108               
00109               lt += (char) i_c2;
00110               switch (i_cstate) {           // --> state machine
00111               case 0:                       // "^-->" state
00112                 if (i_c2 == '-') i_cstate = 1;
00113                 break;
00114               case 1:                       // "-^->" state
00115                 i_cstate = (i_c2 == '-') ? 2 : 0;
00116                 break;
00117               case 2:                       // "--^>" state
00118                 if (i_c2 == '>') {
00119                   lt.Type(CTBxmlLexerToken::comment);
00120                   return;
00121                 }
00122                 if (i_c2 != '-') i_cstate = 0; // this rule allows for ---> ....
00123                 break;
00124               }
00125             }
00126             return;
00127             
00128           } else {                          // "<!-x" -- bad comment start
00129             lt.Type(CTBxmlLexerToken::fail);
00130             return;
00131           }
00132         }
00133 
00134         lt.Type(CTBxmlLexerToken::defopen); // "<!" -- definition start
00135         return;
00136       }
00137 
00138       lt.Type(CTBxmlLexerToken::tagopen);   // "<" -- simple tag open
00139       return;  
00140     }
00141 
00142     if (i_c1 == '&') {                      // "&" -- entity
00143       GetEntity(is,lt);
00144       return;
00145     }    
00146 
00147     if (isspace(i_c1)) {                    // white space found
00148       GetWhite(is,lt);
00149       return;
00150     }
00151     
00152     lt.Type(CTBxmlLexerToken::data);        // must be PCDATA here
00153     for (;;) {                              // gobble up rest of data
00154       int i_c2 = is.peek();
00155       
00156       if (i_c2 == EOF) break;               // eof end data ...
00157       if (isspace(i_c2)) break;             // a white space interrupts data
00158       if (i_c2 == '<' || i_c2 == '&') break; // a < or & ends data
00159       
00160       lt += (char) GetChar(is);             // accumulate data 
00161     }
00162     return;
00163 
00164   case tagattr:
00165     if (i_c1 == '/') {                      // / found
00166       if (is.peek() == '>') {
00167         lt += (char) GetChar(is);
00168         lt.Type(CTBxmlLexerToken::etagclose); // "/>" -- end tag close
00169         return;
00170       }
00171       return; 
00172     }
00173 
00174     if (i_c1 == '?') {                      // ? found
00175       if (is.peek() == '>') {
00176         lt += (char) GetChar(is);
00177         lt.Type(CTBxmlLexerToken::piclose); // "?>" -- pi tag close
00178         return;
00179       }
00180       return; 
00181     }
00182   
00183     if (i_c1 == '>') {                      // simple > here, no / or ? before
00184       lt.Type(CTBxmlLexerToken::tagclose);    // ">" -- tag close
00185       return; 
00186     }
00187 
00188     if (i_c1 == '=') {
00189       lt.Type(CTBxmlLexerToken::equal);     // "=" -- equal char
00190       return; 
00191     }
00192   
00193     if (isspace(i_c1)) {                    // white space found
00194       GetWhite(is,lt);
00195       return;
00196     }
00197     
00198     if (isalpha(i_c1) || i_c1 == '_' || i_c1 == ':') { // name begin found ...
00199       GetName(is,lt);
00200       return;
00201     }
00202     return;
00203 
00204   case attrval:
00205     if (i_c1 == '\'') {
00206       lt.Type(CTBxmlLexerToken::squote);    // "'" -- single quote
00207       return; 
00208     }
00209   
00210     if (i_c1 == '"') {
00211       lt.Type(CTBxmlLexerToken::dquote);    // '"'-- double quote
00212       return; 
00213     }
00214   
00215     if (isspace(i_c1)) {                    // white space found
00216       GetWhite(is,lt);
00217       return;
00218     }
00219     
00220     if (isalpha(i_c1) || i_c1 == '_' || i_c1 == ':') { // name begin found ...
00221       GetName(is,lt);
00222       return;
00223     }
00224     return;
00225 
00226   case valsquote:
00227   case valdquote:
00228     i_quote = (mi_mode == valsquote) ? '\'' : '"';
00229     
00230     if (mi_mode == valsquote && i_c1 == '\'') {
00231       lt.Type(CTBxmlLexerToken::squote);    // "'" -- single quote
00232       return; 
00233     }
00234 
00235     if (mi_mode == valdquote && i_c1 == '"') {
00236       lt.Type(CTBxmlLexerToken::dquote);    // '"'-- double quote
00237       return; 
00238     }
00239     
00240     if (i_c1 == '&') {                      // "&" -- entity
00241       GetEntity(is,lt);
00242       return;
00243     }    
00244 
00245     if (isspace(i_c1)) {                    // white space found
00246       GetWhite(is,lt);
00247       return;
00248     }
00249     
00250     lt.Type(CTBxmlLexerToken::data);        // must be value data here
00251     for (;;) {                              // gobble up rest of data
00252       int i_c2 = is.peek();
00253       
00254       if (i_c2 == EOF) break;               // eof end data ...
00255       if (isspace(i_c2)) break;     // a white space interrupts data
00256       if (i_c2 == '<' || i_c2 == '&') break; // a < or & ends data
00257       if (i_c2 == i_quote) break;           // matching quote data
00258       
00259       lt += (char) GetChar(is);             // accumulate data 
00260     }
00261     return;
00262 
00263   default:
00264     throw CTBexceptionBugcheck("illegal state","CTBxmlLexer::Get()");
00265   }  
00266   return;
00267 }
00268 
00269 //------------------------------------------+-----------------------------------
00271 
00272 void CTBxmlLexer::Clear()
00273 {
00274   CTB_Trace("CTBxmlLexer::Clear()");
00275 
00276   mi_nline_t   = 1;
00277   mi_ncolumn_t = 1;
00278   mi_nline_s   = 1;
00279   mi_ncolumn_s = 1;
00280   return;
00281 }
00282 
00283 //------------------------------------------+-----------------------------------
00285 
00286 void CTBxmlLexer::Dump(int i_indent, ostream& os, const char* p_text) const
00287 {
00288   CTBosFill bl(i_indent);
00289   
00290   os << bl << "--CTBxmlLexer ";
00291   if (p_text) os << p_text;
00292   os << " @ " << this << endl;
00293 
00294   os << bl << "    mi_mode:      " << mi_mode << endl;
00295   os << bl << "    mi_nline_t:   " << mi_nline_t << endl;
00296   os << bl << "    mi_ncolumn_t: " << mi_ncolumn_t << endl;
00297   os << bl << "    mi_nline_s:   " << mi_nline_s << endl;
00298   os << bl << "    mi_ncolumn_s: " << mi_ncolumn_s << endl;
00299 
00300   return;
00301 }
00302 
00303 //------------------------------------------+-----------------------------------
00305 
00306 int CTBxmlLexer::GetChar(istream& is)
00307 {
00308   CTB_Trace("CTBxmlLexer::GetChar(istream&)");
00309   int i_char = is.get();
00310 
00311   if (i_char != EOF) {
00312     if (i_char == '\n') {
00313       mi_nline_s   += 1;
00314       mi_ncolumn_s  = 1;
00315     } else {
00316       mi_ncolumn_s += 1;
00317     }
00318   }
00319   return i_char;
00320 }
00321 
00322 //------------------------------------------+-----------------------------------
00324 
00325 void CTBxmlLexer::GetEntity(istream& is, CTBxmlLexerToken& lt)
00326 {
00327   CTB_Trace("CTBxmlLexer::GetEntity(istream&, CTBxmlLexerToken&)");
00328 
00329   if (is.peek() == '#') {                   // "&#" found
00330     lt += (char) GetChar(is);
00331     lt.Type(CTBxmlLexerToken::charref);     // "&#..;" -- character reference
00332   } else {
00333     lt.Type(CTBxmlLexerToken::entref);      // "&..;" -- entity reference
00334   }
00335     
00336   for (;;) {                                // gobble up rest of reference
00337     int i_c2 = GetChar(is);
00338     
00339     if (i_c2 == EOF) {                      // early end of reference
00340       lt.Type(CTBxmlLexerToken::fail);
00341       return;
00342     }
00343     
00344     lt += (char) i_c2;
00345     if (i_c2 == ';') break;
00346   }
00347   return;
00348 }
00349 
00350 //------------------------------------------+-----------------------------------
00352 
00353 void CTBxmlLexer::GetWhite(istream& is, CTBxmlLexerToken& lt)
00354 {
00355   CTB_Trace("CTBxmlLexer::GetWhite(istream&, CTBxmlLexerToken&)");
00356 
00357   lt.Type(CTBxmlLexerToken::white);
00358   while(isspace(is.peek())) {               // get remaining white space
00359     lt += (char) GetChar(is);
00360   }
00361   return;
00362 }
00363 
00364 //------------------------------------------+-----------------------------------
00366 
00367 void CTBxmlLexer::GetName(istream& is, CTBxmlLexerToken& lt)
00368 {
00369   CTB_Trace("CTBxmlLexer::GetName(istream&, CTBxmlLexerToken&)");
00370 
00371   lt.Type(CTBxmlLexerToken::name);
00372   for (;;) {
00373     int i_c2 = is.peek();
00374     if (isalnum(i_c2) || i_c2 == '.' || i_c2 == '-' || 
00375         i_c2 == '_' || i_c2 == ':') {       // if name char ...
00376       lt += GetChar(is);                    // add
00377     } else {
00378       break;                                // else stop
00379     }
00380   }
00381   return;
00382 }
00383 
00384 //##############################################################################
00385 
00396 //------------------------------------------+-----------------------------------
00398 
00399 void CTBxmlLexerToken::ToStream(ostream& os) const
00400 {
00401   os << "type: ";  
00402 
00403   switch (Type()) {
00404   case invalid:    os << "invalid";   break;
00405   case fail:       os << "fail";      break;
00406   case eof:        os << "eof";       break;
00407   case tagopen:    os << "tagopen";   break;
00408   case etagopen:   os << "etagopen";  break;
00409   case tagclose:   os << "tagclose";  break;
00410   case etagclose:  os << "etagclose"; break;
00411   case comment:    os << "comment";   break;
00412   case defopen:    os << "defopen";   break;
00413   case piopen:     os << "piopen";    break;
00414   case piclose:    os << "piclose";   break;
00415   case charref:    os << "charref";   break;
00416   case entref:     os << "entref";    break;
00417   case equal:      os << "equal";     break;
00418   case squote:     os << "squote";    break;
00419   case dquote:     os << "dquote";    break;
00420   case name:       os << "name";      break;
00421   case white:      os << "white";     break;
00422   case data:       os << "data";      break;
00423   default:         os << "???";       break;
00424   }
00425 
00426   os << " -- \"" << m_text.Escape() << "\"";
00427   
00428   return;
00429 }
00430 
00431 //------------------------------------------+-----------------------------------
00433 
00434 void CTBxmlLexerToken::Dump(int i_indent, ostream& os, const char* p_text) const
00435 {
00436   CTBosFill bl(i_indent);
00437   
00438   os << bl << "--CTBxmlLexerToken ";
00439   if (p_text) os << p_text;
00440   os << " @ " << this << endl;
00441 
00442   os << bl << "    mi_type:      " << mi_type << endl;
00443   m_text.Dump(i_indent+2,os,"m_text");
00444 
00445   return;
00446 }