source: trunk/third/mozilla/htmlparser/src/nsHTMLTokenizer.cpp @ 20551

Revision 20551, 34.3 KB checked in by rbasch, 20 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r20550, which included commits to RCS files with non-trunk default branches.
Line 
1/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2/* ***** BEGIN LICENSE BLOCK *****
3 * Version: NPL 1.1/GPL 2.0/LGPL 2.1
4 *
5 * The contents of this file are subject to the Netscape Public License
6 * Version 1.1 (the "License"); you may not use this file except in
7 * compliance with the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/NPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
13 * License.
14 *
15 * The Original Code is mozilla.org code.
16 *
17 * The Initial Developer of the Original Code is
18 * Netscape Communications Corporation.
19 * Portions created by the Initial Developer are Copyright (C) 1998
20 * the Initial Developer. All Rights Reserved.
21 *
22 * Contributor(s):
23 *
24 *
25 * Alternatively, the contents of this file may be used under the terms of
26 * either the GNU General Public License Version 2 or later (the "GPL"), or
27 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 * in which case the provisions of the GPL or the LGPL are applicable instead
29 * of those above. If you wish to allow use of your version of this file only
30 * under the terms of either the GPL or the LGPL, and not to allow others to
31 * use your version of this file under the terms of the NPL, indicate your
32 * decision by deleting the provisions above and replace them with the notice
33 * and other provisions required by the GPL or the LGPL. If you do not delete
34 * the provisions above, a recipient may use your version of this file under
35 * the terms of any one of the NPL, the GPL or the LGPL.
36 *
37 * ***** END LICENSE BLOCK ***** */
38
39
40/**
41 * MODULE NOTES:
42 * @update  gess 4/1/98
43 *
44 */
45
46#include "nsIAtom.h"
47#include "nsHTMLTokenizer.h"
48#include "nsScanner.h"
49#include "nsElementTable.h"
50#include "CParserContext.h"
51#include "nsReadableUtils.h"
52#include "nsUnicharUtils.h"
53
54/************************************************************************
55  And now for the main class -- nsHTMLTokenizer...
56 ************************************************************************/
57
// Interface IDs that nsHTMLTokenizer::QueryInterface compares incoming
// requests against (nsISupports, nsITokenizer and this concrete class).
58static NS_DEFINE_IID(kISupportsIID,   NS_ISUPPORTS_IID);                 
59static NS_DEFINE_IID(kITokenizerIID,  NS_ITOKENIZER_IID);
60static NS_DEFINE_IID(kClassIID,       NS_HTMLTOKENIZER_IID);
61
62/**
63 *  This method gets called as part of our COM-like interfaces.
64 *  Its purpose is to create an interface to parser object
65 *  of some type.
66 * 
67 *  @update   gess 4/8/98
68 *  @param    nsIID  id of object to discover
69 *  @param    aInstancePtr ptr to newly discovered interface
70 *  @return   NS_xxx result code
71 */
72nsresult nsHTMLTokenizer::QueryInterface(const nsIID& aIID, void** aInstancePtr) 
73{                                                                       
74  if (NULL == aInstancePtr) {                                           
75    return NS_ERROR_NULL_POINTER;                                       
76  }                                                                     
77
78  if(aIID.Equals(kISupportsIID))    {  //do IUnknown...
79    *aInstancePtr = (nsISupports*)(this);                                       
80  }
81  else if(aIID.Equals(kITokenizerIID)) {  //do IParser base class...
82    *aInstancePtr = (nsITokenizer*)(this);                                       
83  }
84  else if(aIID.Equals(kClassIID)) {  //do this class...
85    *aInstancePtr = (nsHTMLTokenizer*)(this);                                       
86  }                 
87  else {
88    *aInstancePtr=0;
89    return NS_NOINTERFACE;
90  }
91  NS_ADDREF_THIS();
92  return NS_OK;                                                       
93}
94
95/**
96 *  This method is defined in nsHTMLTokenizer.h. It is used to
97 *  cause the COM-like construction of an HTMLTokenizer.
98 * 
99 *  @update  gess 4/8/98
100 *  @param   nsIParser** ptr to newly instantiated parser
101 *  @return  NS_xxx error result
102 */
103
104nsresult NS_NewHTMLTokenizer(nsITokenizer** aInstancePtrResult,
105                                         PRInt32 aFlag,
106                                         eParserDocType aDocType,
107                                         eParserCommands aCommand)
108{
109  NS_PRECONDITION(nsnull != aInstancePtrResult, "null ptr");
110  if (nsnull == aInstancePtrResult) {
111    return NS_ERROR_NULL_POINTER;
112  }
113  nsHTMLTokenizer* it = new nsHTMLTokenizer(aFlag,aDocType,aCommand);
114  if (nsnull == it) {
115    return NS_ERROR_OUT_OF_MEMORY;
116  }
117  return it->QueryInterface(kClassIID, (void **) aInstancePtrResult);
118}
119
120
// AddRef/Release for nsHTMLTokenizer come from these two macros; their
// expansions are defined outside this file.
121NS_IMPL_ADDREF(nsHTMLTokenizer)
122NS_IMPL_RELEASE(nsHTMLTokenizer)
123
124
125/**
126 *  Default constructor
127 *   
128 *  @update  gess 4/9/98
129 *  @param   
130 *  @return 
131 */
132 nsHTMLTokenizer::nsHTMLTokenizer(PRInt32 aParseMode,
133                                  eParserDocType aDocType,
134                                  eParserCommands aCommand) :
135  nsITokenizer(), mTokenDeque(0)
136{
137  if (aParseMode==eDTDMode_full_standards ||
138      aParseMode==eDTDMode_almost_standards) {
139    mFlags = NS_IPARSER_FLAG_STRICT_MODE;
140  }
141  else if (aParseMode==eDTDMode_quirks)  {
142    mFlags = NS_IPARSER_FLAG_QUIRKS_MODE;
143  }
144  else if (aParseMode==eDTDMode_autodetect) {
145    mFlags = NS_IPARSER_FLAG_AUTO_DETECT_MODE;
146  }
147  else {
148    mFlags = NS_IPARSER_FLAG_UNKNOWN_MODE;
149  }
150
151  if (aDocType==ePlainText) {
152    mFlags |= NS_IPARSER_FLAG_PLAIN_TEXT;
153  }
154  else if (aDocType==eXML) {
155    mFlags |= NS_IPARSER_FLAG_XML;
156  }
157  else if (aDocType==eHTML_Quirks ||
158           aDocType==eHTML3_Quirks ||
159           aDocType==eHTML_Strict) {
160    mFlags |= NS_IPARSER_FLAG_HTML;
161  }
162 
163  mFlags |= (aCommand==eViewSource)? NS_IPARSER_FLAG_VIEW_SOURCE:NS_IPARSER_FLAG_VIEW_NORMAL;
164
165  mTokenAllocator = nsnull;
166  mTokenScanPos = 0;
167  mPreserveTarget = eHTMLTag_unknown;
168}
169
170
171/**
172 *  Destructor
173 *   
174 *  @update  gess 4/9/98
175 *  @param   
176 *  @return 
177 */
178nsHTMLTokenizer::~nsHTMLTokenizer(){
179  if(mTokenDeque.GetSize()){
180    CTokenDeallocator theDeallocator(mTokenAllocator->GetArenaPool());
181    mTokenDeque.ForEach(theDeallocator);
182  }
183}
184 
185
186/*******************************************************************
187  Here begins the real working methods for the tokenizer.
188 *******************************************************************/
189
190void nsHTMLTokenizer::AddToken(CToken*& aToken,nsresult aResult,nsDeque* aDeque,nsTokenAllocator* aTokenAllocator) {
191  if(aToken && aDeque) {
192    if(NS_SUCCEEDED(aResult)) {
193      aDeque->Push(aToken);
194    }
195    else {
196      IF_FREE(aToken, aTokenAllocator);
197    }
198  }
199}
200
201/**
202 * Retrieve a ptr to the global token recycler...
203 * @update      gess8/4/98
204 * @return  ptr to recycler (or null)
205 */
206nsTokenAllocator* nsHTMLTokenizer::GetTokenAllocator(void) {
207  return mTokenAllocator;
208}
209
210
211/**
212 * This method provides access to the topmost token in the tokenDeque.
213 * The token is not really removed from the list.
214 * @update      gess8/2/98
215 * @return  ptr to token
216 */
217CToken* nsHTMLTokenizer::PeekToken() {
218  return (CToken*)mTokenDeque.PeekFront();
219}
220
221
222/**
223 * This method provides access to the topmost token in the tokenDeque.
224 * The token is really removed from the list; if the list is empty we return 0.
225 * @update      gess8/2/98
226 * @return  ptr to token or NULL
227 */
228CToken* nsHTMLTokenizer::PopToken() {
229  CToken* result=nsnull;
230  result=(CToken*)mTokenDeque.PopFront();
231  return result;
232}
233
234
235/**
236 *
237 * @update      gess8/2/98
238 * @param
239 * @return
240 */
241CToken* nsHTMLTokenizer::PushTokenFront(CToken* theToken) {
242  mTokenDeque.PushFront(theToken);
243  return theToken;
244}
245
246/**
247 *
248 * @update      gess8/2/98
249 * @param
250 * @return
251 */
252CToken* nsHTMLTokenizer::PushToken(CToken* theToken) {
253  mTokenDeque.Push(theToken);
254  return theToken;
255}
256
257/**
258 *
259 * @update      gess12/29/98
260 * @param
261 * @return
262 */
263PRInt32 nsHTMLTokenizer::GetCount(void) {
264  return mTokenDeque.GetSize();
265}
266
267/**
268 *
269 * @update      gess12/29/98
270 * @param
271 * @return
272 */
273CToken* nsHTMLTokenizer::GetTokenAt(PRInt32 anIndex){
274  return (CToken*)mTokenDeque.ObjectAt(anIndex);
275}
276
277/**
278 * @update      gess 12/29/98
279 * @update      harishd 08/04/00
280 * @param
281 * @return
282 */
283nsresult nsHTMLTokenizer::WillTokenize(PRBool aIsFinalChunk,nsTokenAllocator* aTokenAllocator)
284{
285  mTokenAllocator=aTokenAllocator;
286  mIsFinalChunk=aIsFinalChunk;
287  mTokenScanPos=mTokenDeque.GetSize(); //cause scanDocStructure to search from here for new tokens...
288  return NS_OK;
289}
290
291/**
292 *
293 * @update      gess12/29/98
294 * @param
295 * @return
296 */
297void nsHTMLTokenizer::PrependTokens(nsDeque& aDeque){
298
299  PRInt32 aCount=aDeque.GetSize();
300 
301  //last but not least, let's check the misplaced content list.
302  //if we find it, then we have to push it all into the body before continuing...
303  PRInt32 anIndex=0;
304  for(anIndex=0;anIndex<aCount;++anIndex){
305    CToken* theToken=(CToken*)aDeque.Pop();
306    PushTokenFront(theToken);
307  }
308
309}
310
311NS_IMETHODIMP
312nsHTMLTokenizer::CopyState(nsITokenizer* aTokenizer)
313{
314  if (aTokenizer) {
315    mFlags &= ~NS_IPARSER_FLAG_PRESERVE_CONTENT;
316    mPreserveTarget =
317      NS_STATIC_CAST(nsHTMLTokenizer*, aTokenizer)->mPreserveTarget;
318    if (mPreserveTarget != eHTMLTag_unknown)
319      mFlags |= NS_IPARSER_FLAG_PRESERVE_CONTENT;
320  }
321  return NS_OK;
322}
323
324/**
325 * This is a utilty method for ScanDocStructure, which finds a given
326 * tag in the stack.
327 *
328 * @update      gess 08/30/00
329 * @param   aTag -- the ID of the tag we're seeking
330 * @param   aTagStack -- the stack to be searched
331 * @return  index pos of tag in stack if found, otherwise kNotFound
332 */
333static PRInt32 FindLastIndexOfTag(eHTMLTags aTag,nsDeque &aTagStack) {
334  PRInt32 theCount=aTagStack.GetSize();
335 
336  while(0<theCount) {
337    CHTMLToken *theToken=(CHTMLToken*)aTagStack.ObjectAt(--theCount); 
338    if(theToken) {
339      eHTMLTags  theTag=(eHTMLTags)theToken->GetTypeID();
340      if(theTag==aTag) {
341        return theCount;
342      }
343    }
344  }
345
346  return kNotFound;
347}
348
349/**
350 * This method scans the sequence of tokens to determine the
351 * well formedness of each tag structure. This is used to
352 * disable residual-style handling in well formed cases.
353 *
354 * @update      gess 1Sep2000
355 * @param
356 * @return
357 */
358nsresult nsHTMLTokenizer::ScanDocStructure(PRBool aFinalChunk) {
359  nsresult result=NS_OK;
360  if (!mTokenDeque.GetSize())
361    return result;
362
363  CHTMLToken  *theRootToken=0;
364
365    //*** start by finding the first start tag that hasn't been reviewed.
366
367  while(mTokenScanPos>0) {
368    theRootToken=(CHTMLToken*)mTokenDeque.ObjectAt(mTokenScanPos);
369    if(theRootToken) {
370      eHTMLTokenTypes theType=eHTMLTokenTypes(theRootToken->GetTokenType()); 
371      if(eToken_start==theType) {
372        if(eFormUnknown==theRootToken->GetContainerInfo()) {
373          break;
374        }
375      }     
376    }
377    mTokenScanPos--;
378  }
379
380  /*----------------------------------------------------------------------
381   *  Now that we know where to start, let's walk through the
382   *  tokens to see which are well-formed. Stop when you run out
383   *  of fresh tokens.
384   *---------------------------------------------------------------------*/
385
386  theRootToken=(CHTMLToken*)mTokenDeque.ObjectAt(mTokenScanPos); //init to root
387
388  nsDeque       theStack(0);
389  eHTMLTags     theRootTag=eHTMLTag_unknown;
390  CHTMLToken    *theToken=theRootToken; //init to root
391  PRInt32       theStackDepth=0;   
392
393  static  const PRInt32 theMaxStackDepth=200;   //dont bother if we get ridiculously deep.
394
395  while(theToken && (theStackDepth<theMaxStackDepth)) {
396
397    eHTMLTokenTypes theType=eHTMLTokenTypes(theToken->GetTokenType());
398    eHTMLTags       theTag=(eHTMLTags)theToken->GetTypeID();
399
400    PRBool          theTagIsContainer=nsHTMLElement::IsContainer(theTag);  //bug54117...
401
402    if(theTagIsContainer) {
403      PRBool          theTagIsBlock=gHTMLElements[theTag].IsMemberOf(kBlockEntity);
404      PRBool          theTagIsInline= (theTagIsBlock) ? PR_FALSE : gHTMLElements[theTag].IsMemberOf(kInlineEntity);
405
406      if(theTagIsBlock || theTagIsInline || (eHTMLTag_table==theTag)) {
407
408        switch(theType) {
409
410          case eToken_start:
411            if(0==theStack.GetSize()) {
412                //track the tag on the top of the stack...
413              theRootToken=theToken;
414              theRootTag=theTag;
415            }
416            theStack.Push(theToken);
417            ++theStackDepth;
418            break;
419
420          case eToken_end:
421            {
422              CHTMLToken *theLastToken= NS_STATIC_CAST(CHTMLToken*, theStack.Peek());
423              if(theLastToken) {
424                if(theTag==theLastToken->GetTypeID()) {
425                  theStack.Pop(); //yank it for real
426                  theStackDepth--;
427                  theLastToken->SetContainerInfo(eWellFormed);
428
429                  //in addition, let's look above this container to see if we can find
430                  //any tags that are already marked malformed. If so, pop them too!
431
432                  theLastToken= NS_STATIC_CAST(CHTMLToken*, theStack.Peek());
433                  while(theLastToken) {
434                    if(eMalformed==theRootToken->GetContainerInfo()) {
435                      theStack.Pop(); //yank the malformed token for real.
436                      theLastToken= NS_STATIC_CAST(CHTMLToken*, theStack.Peek());
437                      continue;
438                    }
439                    break;
440                  }
441                }
442                else {
443                  //the topmost token isn't what we expected, so that container must
444                  //be malformed. If the tag is a block, we don't really care (but we'll
445                  //mark it anyway). If it's an inline we DO care, especially if the
446                  //inline tried to contain a block (that's when RS handling kicks in).
447                  if(theTagIsInline) {
448                    PRInt32 theIndex=FindLastIndexOfTag(theTag,theStack);
449                    if(kNotFound!=theIndex) {
450                      theToken=(CHTMLToken*)theStack.ObjectAt(theIndex);                       
451                      theToken->SetContainerInfo(eMalformed);
452                    }
453                    //otherwise we ignore an out-of-place end tag.
454                  }
455                  else {
456                  }
457                }
458              }
459            }
460            break;
461
462          default:
463            break;
464        } //switch
465
466      }
467    }
468
469    theToken=(CHTMLToken*)mTokenDeque.ObjectAt(++mTokenScanPos);
470  }
471
472  return result;
473}
474
475nsresult nsHTMLTokenizer::DidTokenize(PRBool aFinalChunk) {
476  return ScanDocStructure(aFinalChunk);
477}
478
479/**
480 *  This method repeatedly called by the tokenizer.
481 *  Each time, we determine the kind of token were about to
482 *  read, and then we call the appropriate method to handle
483 *  that token type.
484 * 
485 *  @update gess 3/25/98
486 *  @param  aChar: last char read
487 *  @param  aScanner: see nsScanner.h
488 *  @param  anErrorCode: arg that will hold error condition
489 *  @return new token or null
490 */
491nsresult nsHTMLTokenizer::ConsumeToken(nsScanner& aScanner,PRBool& aFlushTokens) {
492
493  PRUnichar theChar;
494  CToken* theToken=0;
495
496  nsresult result=aScanner.Peek(theChar);
497
498  switch(result) {
499    case kEOF:
500        //We convert from eof to complete here, because we never really tried to get data.
501        //All we did was try to see if data was available, which it wasn't.
502        //It's important to return process complete, so that controlling logic can know that
503        //everything went well, but we're done with token processing.
504      return result;
505
506    case NS_OK:
507    default:
508
509      if(!(mFlags & NS_IPARSER_FLAG_PLAIN_TEXT)) {
510        if(kLessThan==theChar) {
511          return ConsumeTag(theChar,theToken,aScanner,aFlushTokens);
512        }
513        else if(kAmpersand==theChar){
514          return ConsumeEntity(theChar,theToken,aScanner);
515        }
516      }
517     
518      if((kCR==theChar) || (kLF==theChar)) {
519        return ConsumeNewline(theChar,theToken,aScanner);
520      }
521      else {
522        if(!nsCRT::IsAsciiSpace(theChar)) {
523          if(theChar!=nsnull) {
524            result=ConsumeText(theToken,aScanner);
525          }
526          else {
527            aScanner.GetChar(theChar); // skip the embedded null char. Fix bug 64098.
528          }
529          break;
530        }
531        result=ConsumeWhitespace(theChar,theToken,aScanner);
532      }
533      break;
534  } //switch
535
536  return result;
537}
538
539
540/**
541 *  This method is called just after a "<" has been consumed
542 *  and we know we're at the start of some kind of tagged
543 *  element. We don't know yet if it's a tag or a comment.
544 * 
545 *  @update  gess 5/12/98
546 *  @param   aChar is the last char read
547 *  @param   aScanner is represents our input source
548 *  @param   aToken is the out arg holding our new token
549 *  @return  error code.
550 */
551nsresult nsHTMLTokenizer::ConsumeTag(PRUnichar aChar,CToken*& aToken,nsScanner& aScanner,PRBool& aFlushTokens) {
552
553  PRUnichar theNextChar, oldChar;
554  nsresult result=aScanner.Peek(aChar,1);
555
556  if(NS_OK==result) {
557
558    switch(aChar) {
559      case kForwardSlash:
560        // Get the original "<" (we've already seen it with a Peek)
561        aScanner.GetChar(oldChar);
562
563        result=aScanner.Peek(theNextChar, 1);
564        if(NS_OK==result) {
565          // xml allow non ASCII tag name, consume as end tag. need to make xml view source work
566          PRBool isXML=(mFlags & NS_IPARSER_FLAG_XML);
567          if(nsCRT::IsAsciiAlpha(theNextChar)||(kGreaterThan==theNextChar)||
568             (isXML && (! nsCRT::IsAscii(theNextChar)))) {
569            result=ConsumeEndTag(aChar,aToken,aScanner);
570          }
571          else result=ConsumeComment(aChar,aToken,aScanner);
572        }//if
573        break;
574
575      case kExclamation:
576        // Get the original "<" (we've already seen it with a Peek)
577        aScanner.GetChar(oldChar);
578
579        result=aScanner.Peek(theNextChar, 1);
580        if(NS_OK==result) {
581          if((kMinus==theNextChar) || (kGreaterThan==theNextChar)) {
582            result=ConsumeComment(aChar,aToken,aScanner);
583          }
584          else
585            result=ConsumeSpecialMarkup(aChar,aToken,aScanner);
586        }
587        break;
588
589      case kQuestionMark: //it must be an XML processing instruction...
590        // Get the original "<" (we've already seen it with a Peek)
591        aScanner.GetChar(oldChar);
592        result=ConsumeProcessingInstruction(aChar,aToken,aScanner);
593        break;
594
595      default:
596        if(nsCRT::IsAsciiAlpha(aChar)) {
597          // Get the original "<" (we've already seen it with a Peek)
598          aScanner.GetChar(oldChar);
599          result=ConsumeStartTag(aChar,aToken,aScanner,aFlushTokens);
600        }
601        else {
602          // We are not dealing with a tag. So, don't consume the original
603          // char and leave the decision to ConsumeText().
604          result=ConsumeText(aToken,aScanner);
605        }
606    } //switch
607
608  } //if
609  return result;
610}
611
612/**
613 *  This method is called just after we've consumed a start
614 *  tag, and we now have to consume its attributes.
615 * 
616 *  @update  rickg  03.23.2000
617 *  @param   aChar: last char read
618 *  @param   aScanner: see nsScanner.h
619 *  @param   aLeadingWS: contains ws chars that preceeded the first attribute
620 *  @return 
621 */
622nsresult nsHTMLTokenizer::ConsumeAttributes(PRUnichar aChar,
623                                            CToken* aToken,
624                                            nsScanner& aScanner) {
625  PRBool done=PR_FALSE;
626  nsresult result=NS_OK;
627  PRInt16 theAttrCount=0;
628
629  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
630
631  while((!done) && (result==NS_OK)) {
632    CAttributeToken* theToken= NS_STATIC_CAST(CAttributeToken*, theAllocator->CreateTokenOfType(eToken_attribute,eHTMLTag_unknown));
633    if(theToken){
634      result=theToken->Consume(aChar,aScanner,mFlags);  //tell new token to finish consuming text...   
635 
636      //Much as I hate to do this, here's some special case code.
637      //This handles the case of empty-tags in XML. Our last
638      //attribute token will come through with a text value of ""
639      //and a textkey of "/". We should destroy it, and tell the
640      //start token it was empty.
641      if(NS_SUCCEEDED(result)) {
642        PRBool isUsableAttr = PR_TRUE;
643        const nsAString& key=theToken->GetKey();
644        const nsAString& text=theToken->GetValue();
645
646         // support XML like syntax to fix bugs like 44186
647        if(!key.IsEmpty() && kForwardSlash==key.First() && text.IsEmpty()) {
648          isUsableAttr = PRBool(mFlags & NS_IPARSER_FLAG_VIEW_SOURCE); // Fix bug 103095
649          aToken->SetEmpty(isUsableAttr);
650        }
651        if(isUsableAttr) {
652          ++theAttrCount;
653          AddToken((CToken*&)theToken,result,&mTokenDeque,theAllocator);
654        }
655        else {
656          IF_FREE(theToken, mTokenAllocator);
657        }
658      }
659      else { //if(NS_ERROR_HTMLPARSER_BADATTRIBUTE==result){
660        aToken->SetEmpty(PR_TRUE);
661        IF_FREE(theToken, mTokenAllocator);
662        if(NS_ERROR_HTMLPARSER_BADATTRIBUTE==result)
663          result=NS_OK;
664      }
665    }//if
666   
667#ifdef DEBUG
668    if(NS_SUCCEEDED(result)){
669      PRInt32 newline = 0;
670      result = aScanner.SkipWhitespace(newline);
671      NS_ASSERTION(newline == 0, "CAttribute::Consume() failed to collect all the newlines!");
672    }
673#endif
674    if (NS_SUCCEEDED(result)) {
675      result = aScanner.Peek(aChar);
676      if (NS_SUCCEEDED(result)) {
677        if (aChar == kGreaterThan) { //you just ate the '>'
678          aScanner.GetChar(aChar); //skip the '>'
679          done = PR_TRUE;
680        }
681        else if(aChar == kLessThan) {
682          done = PR_TRUE;
683        }
684      }//if
685    }//if
686  }//while
687
688  aToken->SetAttributeCount(theAttrCount);
689  return result;
690}
691
692/**
693 *
694 * @update      gess12/28/98
695 * @param
696 * @return
697 */
698nsresult nsHTMLTokenizer::ConsumeStartTag(PRUnichar aChar,CToken*& aToken,nsScanner& aScanner,PRBool& aFlushTokens) {
699  PRInt32 theDequeSize=mTokenDeque.GetSize(); //remember this for later in case you have to unwind...
700  nsresult result=NS_OK;
701
702  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
703  aToken=theAllocator->CreateTokenOfType(eToken_start,eHTMLTag_unknown);
704 
705  if(aToken) {
706    // Save the position after '<' for use in recording traling contents. Ref: Bug. 15204.
707    nsScannerIterator origin;
708    aScanner.CurrentPosition(origin);
709
710    result= aToken->Consume(aChar,aScanner,mFlags);     //tell new token to finish consuming text...   
711
712    if(NS_SUCCEEDED(result)) {
713     
714      AddToken(aToken,result,&mTokenDeque,theAllocator);
715      NS_ENSURE_SUCCESS(result, result);
716
717      eHTMLTags theTag=(eHTMLTags)aToken->GetTypeID();
718
719      //Good. Now, let's see if the next char is ">".
720      //If so, we have a complete tag, otherwise, we have attributes.
721      result = aScanner.Peek(aChar);
722      NS_ENSURE_SUCCESS(result, result);
723
724      if(kGreaterThan != aChar) { //look for '>'
725        result = ConsumeAttributes(aChar, aToken, aScanner);
726      } //if
727      else {
728        aScanner.GetChar(aChar);
729      }       
730
731      /*  Now that that's over with, we have one more problem to solve.
732          In the case that we just read a <SCRIPT> or <STYLE> tags, we should go and
733          consume all the content itself.
734       */
735      if(NS_SUCCEEDED(result)) {
736        CStartToken* theStartToken = NS_STATIC_CAST(CStartToken*,aToken);
737        //XXX - Find a better soution to record content
738        //Added _plaintext to fix bug 46054.
739        if(!(mFlags & NS_IPARSER_FLAG_PRESERVE_CONTENT) &&
740           (theTag == eHTMLTag_textarea  ||
741            theTag == eHTMLTag_xmp       ||
742            theTag == eHTMLTag_plaintext ||
743            theTag == eHTMLTag_noscript  ||
744            theTag == eHTMLTag_noframes)) {
745          NS_ASSERTION(mPreserveTarget == eHTMLTag_unknown,
746                       "mPreserveTarget set but not preserving content?");
747          mPreserveTarget = theTag;
748          mFlags |= NS_IPARSER_FLAG_PRESERVE_CONTENT;
749        }
750         
751        if (mFlags & NS_IPARSER_FLAG_PRESERVE_CONTENT)
752          PreserveToken(theStartToken, aScanner, origin);
753       
754        //if((eHTMLTag_style==theTag) || (eHTMLTag_script==theTag)) {
755        if(gHTMLElements[theTag].CanContainType(kCDATA)) {
756          nsAutoString endTagName;
757          endTagName.Assign(nsHTMLTags::GetStringValue(theTag));
758
759          CToken*     text=theAllocator->CreateTokenOfType(eToken_text,eHTMLTag_text);
760          CTextToken* textToken=NS_STATIC_CAST(CTextToken*,text);
761          result=textToken->ConsumeUntil(0,theTag!=eHTMLTag_script,aScanner,endTagName,mFlags,aFlushTokens);  //tell new token to finish consuming text...   
762         
763          // Fix bug 44186
764          // Support XML like syntax, i.e., <script src="external.js"/> == <script src="external.js"></script>
765          // Note: if aFlushTokens is TRUE then we have seen an </script>
766          // We do NOT want to output the end token if we didn't see a
767          // </script> and have a preserve target.  If that happens, then we'd
768          // be messing up the text inside the <textarea> or <xmp> or whatever
769          // it is.
770          if((!(mFlags & NS_IPARSER_FLAG_PRESERVE_CONTENT) &&
771              !theStartToken->IsEmpty()) || aFlushTokens) {
772            theStartToken->SetEmpty(PR_FALSE); // Setting this would make cases like <script/>d.w("text");</script> work.
773            CToken* endToken=theAllocator->CreateTokenOfType(eToken_end,theTag,endTagName);
774            AddToken(text,result,&mTokenDeque,theAllocator);
775            AddToken(endToken,result,&mTokenDeque,theAllocator);
776          }
777          else {
778            IF_FREE(text, mTokenAllocator);
779          }
780        }
781      }
782 
783      //EEEEECCCCKKKK!!!
784      //This code is confusing, so pay attention.
785      //If you're here, it's because we were in the midst of consuming a start
786      //tag but ran out of data (not in the stream, but in this *part* of the stream.
787      //For simplicity, we have to unwind our input. Therefore, we pop and discard
788      //any new tokens we've cued this round. Later we can get smarter about this.
789      if(NS_FAILED(result)) {
790        while(mTokenDeque.GetSize()>theDequeSize) {
791          CToken* theToken=(CToken*)mTokenDeque.Pop();
792          IF_FREE(theToken, mTokenAllocator);
793        }
794      }
795    } //if
796    else IF_FREE(aToken, mTokenAllocator);
797  } //if
798  return result;
799}
800
801/**
802 *
803 * @update      gess12/28/98
804 * @param
805 * @return
806 */
807nsresult nsHTMLTokenizer::ConsumeEndTag(PRUnichar aChar,CToken*& aToken,nsScanner& aScanner) {
808 
809  // Get the "/" (we've already seen it with a Peek)
810  aScanner.GetChar(aChar);
811
812  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
813  aToken=theAllocator->CreateTokenOfType(eToken_end,eHTMLTag_unknown);
814  nsresult result=NS_OK;
815 
816  if(aToken) {
817    result= aToken->Consume(aChar,aScanner,mFlags);  //tell new token to finish consuming text...   
818    AddToken(aToken,result,&mTokenDeque,theAllocator);
819    NS_ENSURE_SUCCESS(result, result);
820     
821    result = aScanner.Peek(aChar);
822    NS_ENSURE_SUCCESS(result, result);
823
824    if(kGreaterThan != aChar) {
825      result = ConsumeAttributes(aChar, aToken, aScanner);
826      NS_ENSURE_SUCCESS(result, result);
827    }
828    else {
829      aScanner.GetChar(aChar);
830    }       
831
832    if (NS_SUCCEEDED(result)) {
833      eHTMLTags theTag = (eHTMLTags)aToken->GetTypeID();
834      if (mPreserveTarget == theTag) {
835        // Target reached. Stop preserving content.
836        mPreserveTarget = eHTMLTag_unknown;
837        mFlags &= ~NS_IPARSER_FLAG_PRESERVE_CONTENT;
838      }
839    }
840  } //if
841  return result;
842}
843
844/**
845 *  This method is called just after a "&" has been consumed
846 *  and we know we're at the start of an entity. 
847 * 
848 *  @update gess 3/25/98
849 *  @param  aChar: last char read
850 *  @param  aScanner: see nsScanner.h
851 *  @param  anErrorCode: arg that will hold error condition
852 *  @return new token or null
853 */
854nsresult nsHTMLTokenizer::ConsumeEntity(PRUnichar aChar,CToken*& aToken,nsScanner& aScanner) {
855   PRUnichar  theChar;
856   nsresult result=aScanner.Peek(theChar, 1);
857
858  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
859  if (NS_SUCCEEDED(result)) {
860    if (nsCRT::IsAsciiAlpha(theChar) || theChar==kHashsign) {
861      aToken = theAllocator->CreateTokenOfType(eToken_entity,eHTMLTag_entity);
862      result=aToken->Consume(theChar,aScanner,mFlags);
863
864      if (result == NS_HTMLTOKENS_NOT_AN_ENTITY) {
865        IF_FREE(aToken, mTokenAllocator);
866      }
867      else {
868        if (mIsFinalChunk && result == kEOF) {
869          result=NS_OK; //use as much of the entity as you can get.
870        }
871        AddToken(aToken,result,&mTokenDeque,theAllocator);
872        return result;
873      }
874    }
875    // oops, we're actually looking at plain text...
876    result = ConsumeText(aToken,aScanner);
877  }//if
878  return result;
879}
880
881
882/**
883 *  This method is called just after whitespace has been
884 *  consumed and we know we're at the start a whitespace run. 
885 * 
886 *  @update gess 3/25/98
887 *  @param  aChar: last char read
888 *  @param  aScanner: see nsScanner.h
889 *  @param  anErrorCode: arg that will hold error condition
890 *  @return new token or null
891 */
892nsresult nsHTMLTokenizer::ConsumeWhitespace(PRUnichar aChar,CToken*& aToken,nsScanner& aScanner) {
893  // Get the whitespace character
894  aScanner.GetChar(aChar);
895
896  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
897  aToken = theAllocator->CreateTokenOfType(eToken_whitespace,eHTMLTag_whitespace);
898  nsresult result=NS_OK;
899  if(aToken) {
900    result=aToken->Consume(aChar,aScanner,mFlags);
901    AddToken(aToken,result,&mTokenDeque,theAllocator);
902  }
903  return result;
904}
905
906/**
907 *  This method is called just after a "<!" has been consumed
908 *  and we know we're at the start of a comment. 
909 * 
910 *  @update gess 3/25/98
911 *  @param  aChar: last char read
912 *  @param  aScanner: see nsScanner.h
913 *  @param  anErrorCode: arg that will hold error condition
914 *  @return new token or null
915 */
916nsresult nsHTMLTokenizer::ConsumeComment(PRUnichar aChar,CToken*& aToken,nsScanner& aScanner){
917  // Get the "!"
918  aScanner.GetChar(aChar);
919
920  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
921  aToken = theAllocator->CreateTokenOfType(eToken_comment,eHTMLTag_comment);
922  nsresult result=NS_OK;
923  if(aToken) {
924    result=aToken->Consume(aChar,aScanner,mFlags);
925    AddToken(aToken,result,&mTokenDeque,theAllocator);
926  }
927  return result;
928}
929
930/**
931 *  This method is called just after a known text char has
932 *  been consumed and we should read a text run.
933 * 
934 *  @update gess 3/25/98
935 *  @param  aChar: last char read
936 *  @param  aScanner: see nsScanner.h
937 *  @param  anErrorCode: arg that will hold error condition
938 *  @return new token or null
939 */
940nsresult nsHTMLTokenizer::ConsumeText(CToken*& aToken,nsScanner& aScanner){
941  nsresult result=NS_OK;
942  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
943  CTextToken* theToken = (CTextToken*)theAllocator->CreateTokenOfType(eToken_text,eHTMLTag_text);
944  if(theToken) {
945    PRUnichar ch=0;
946    result=theToken->Consume(ch,aScanner,mFlags);
947    if(NS_FAILED(result)) {
948      if(0==theToken->GetTextLength()){
949        IF_FREE(aToken, mTokenAllocator);
950        aToken = nsnull;
951      }
952      else result=NS_OK;
953    }
954    aToken = theToken;
955    AddToken(aToken,result,&mTokenDeque,theAllocator);
956  }
957  return result;
958}
959
960/**
961 *  This method is called just after a "<!" has been consumed.
962 *  NOTE: Here we might consume DOCTYPE and "special" markups.
963 *
964 * 
965 *  @update harishd 09/02/99
966 *  @param  aChar: last char read
967 *  @param  aScanner: see nsScanner.h
968 *  @param  anErrorCode: arg that will hold error condition
969 *  @return new token or null
970 */
971nsresult nsHTMLTokenizer::ConsumeSpecialMarkup(PRUnichar aChar,CToken*& aToken,nsScanner& aScanner){
972
973  // Get the "!"
974  aScanner.GetChar(aChar);
975
976  nsresult result=NS_OK;
977  nsAutoString theBufCopy;
978  aScanner.Peek(theBufCopy, 20);
979  ToUpperCase(theBufCopy);
980  PRInt32 theIndex=theBufCopy.Find("DOCTYPE");
981  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
982 
983  if(theIndex==kNotFound) {
984    if('['==theBufCopy.CharAt(0)) {
985      aToken = theAllocator->CreateTokenOfType(eToken_cdatasection,eHTMLTag_comment); 
986    } else if (StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ELEMENT")) ||
987               StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ATTLIST")) ||
988               StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ENTITY")) ||
989               StringBeginsWith(theBufCopy, NS_LITERAL_STRING("NOTATION"))) {
990      aToken = theAllocator->CreateTokenOfType(eToken_markupDecl,eHTMLTag_markupDecl);
991    } else {
992      aToken = theAllocator->CreateTokenOfType(eToken_comment,eHTMLTag_comment);
993    }
994  }
995  else
996    aToken = theAllocator->CreateTokenOfType(eToken_doctypeDecl,eHTMLTag_doctypeDecl);
997 
998  if(aToken) {
999    result=aToken->Consume(aChar,aScanner,mFlags);
1000    AddToken(aToken,result,&mTokenDeque,theAllocator);
1001  }
1002  return result;
1003}
1004
1005/**
1006 *  This method is called just after a newline has been consumed.
1007 * 
1008 *  @update gess 3/25/98
1009 *  @param  aChar: last char read
1010 *  @param  aScanner: see nsScanner.h
1011 *  @param  aToken is the newly created newline token that is parsing
1012 *  @return error code
1013 */
1014nsresult nsHTMLTokenizer::ConsumeNewline(PRUnichar aChar,CToken*& aToken,nsScanner& aScanner){
1015  // Get the newline character
1016  aScanner.GetChar(aChar);
1017
1018  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
1019  aToken=theAllocator->CreateTokenOfType(eToken_newline,eHTMLTag_newline);
1020  nsresult result=NS_OK;
1021  if(aToken) {
1022    result=aToken->Consume(aChar,aScanner,mFlags);
1023    AddToken(aToken,result,&mTokenDeque,theAllocator);
1024  }
1025  return result;
1026}
1027
1028
1029/**
1030 *  This method is called just after a ? has been consumed.
1031 * 
1032 *  @update gess 3/25/98
1033 *  @param  aChar: last char read
1034 *  @param  aScanner: see nsScanner.h
1035 *  @param  aToken is the newly created newline token that is parsing
1036 *  @return error code
1037 */
1038nsresult nsHTMLTokenizer::ConsumeProcessingInstruction(PRUnichar aChar,CToken*& aToken,nsScanner& aScanner){
1039 
1040  // Get the "?"
1041  aScanner.GetChar(aChar);
1042
1043  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
1044  aToken=theAllocator->CreateTokenOfType(eToken_instruction,eHTMLTag_unknown);
1045  nsresult result=NS_OK;
1046  if(aToken) {
1047    result=aToken->Consume(aChar,aScanner,mFlags);
1048    AddToken(aToken,result,&mTokenDeque,theAllocator);
1049  }
1050  return result;
1051}
1052
1053/**
1054 *  This method keeps a copy of contents within the start token.
1055 *  The stored content could later be used in displaying TEXTAREA,
1056 *  and also in view source.
1057 * 
1058 *  @update harishd 11/09/99
1059 *  @param  aStartToken: The token whose trailing contents are to be recorded
1060 *  @param  aScanner: see nsScanner.h
1061 * 
1062 */
1063
1064void nsHTMLTokenizer::PreserveToken(CStartToken* aStartToken,
1065                                    nsScanner& aScanner,
1066                                    nsScannerIterator aOrigin) {
1067  if(aStartToken) {
1068    nsScannerIterator theCurrentPosition;
1069    aScanner.CurrentPosition(theCurrentPosition);
1070
1071    nsString& trailingContent = aStartToken->mTrailingContent;
1072    PRUint32 oldLength = trailingContent.Length();
1073    trailingContent.SetLength(oldLength + Distance(aOrigin, theCurrentPosition));
1074
1075    nsWritingIterator<PRUnichar> beginWriting;
1076    trailingContent.BeginWriting(beginWriting);
1077    beginWriting.advance(oldLength);
1078
1079    copy_string( aOrigin, theCurrentPosition, beginWriting );
1080  }
1081}
Note: See TracBrowser for help on using the repository browser.