U v_,@sddlmZmZmZddlmZddlmZm Z ddl m Z ddl m Z ddl mZddl mZmZdd l mZmZmZdd l mZmZdd l mZdd lmZdd lmZeeZe dkreZne ZGdddeZdS))absolute_importdivisionunicode_literals)unichr)deque OrderedDict) version_info)spaceCharacters)entities) asciiLettersasciiUpper2Lower)digits hexDigitsEOF) tokenTypes tagTokenTypes)replacementCharacters)HTMLInputStream)Trie)csdeZdZdZdfdd ZddZddZdd d Zd d ZddZ ddZ ddZ ddZ ddZ ddZddZddZddZd d!Zd"d#Zd$d%Zd&d'Zd(d)Zd*d+Zd,d-Zd.d/Zd0d1Zd2d3Zd4d5Zd6d7Zd8d9Zd:d;Zdd?Z!d@dAZ"dBdCZ#dDdEZ$dFdGZ%dHdIZ&dJdKZ'dLdMZ(dNdOZ)dPdQZ*dRdSZ+dTdUZ,dVdWZ-dXdYZ.dZd[Z/d\d]Z0d^d_Z1d`daZ2dbdcZ3dddeZ4dfdgZ5dhdiZ6djdkZ7dldmZ8dndoZ9dpdqZ:drdsZ;dtduZdzd{Z?d|d}Z@d~dZAddZBddZCddZDddZEddZFddZGddZHddZIddZJddZKddZLZMS) HTMLTokenizera  This class takes care of tokenizing HTML. * self.currentToken Holds the token that is currently being processed. * self.state Holds a reference to the method to be invoked... XXX * self.stream Points to HTMLInputStream object. Nc sFt|f||_||_d|_g|_|j|_d|_d|_t t | dS)NF) rstreamparser escapeFlag lastFourChars dataStatestateescape currentTokensuperr__init__)selfrrkwargs __class__/builddir/build/BUILDROOT/alt-python38-pip-20.2.4-1.el7.x86_64/opt/alt/python38/lib/python3.8/site-packages/pip/_vendor/html5lib/_tokenizer.pyr"(szHTMLTokenizer.__init__ccsPtg|_|rL|jjr6td|jjddVq|jr |jVq6q dS)z This is where the magic happens. We do our usually processing through the states and when we have a token to return we yield the token which pauses processing until the next token is requested. ParseErrorrtypedataN)r tokenQueuerrerrorsrpoppopleftr#r'r'r(__iter__7s  zHTMLTokenizer.__iter__c %Cst}d}|rt}d}g}|j}||krH|tk rH|||j}q"td||}|tkrt|}|j t ddd|idnbd|krd ksn|d krd }|j t ddd|idn d |krd ksnd|krdksnd|krdksnd|kr,dksn|t ddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d g#kr|j t ddd|idz t |}Wn>t k r|d6}t d|d?Bt d7|d8@B}YnX|d9kr|j t dd:d;|j||S)r)z'expected-tag-name-but-got-right-bracketr*rXz<>?z'expected-tag-name-but-got-question-markzexpected-tag-namerQT)rrAmarkupDeclarationOpenStatercloseTagOpenStater rr tagNameStater-rBrrHbogusCommentStaterr'r'r(ruws@           zHTMLTokenizer.tagOpenStatecCs|j}|tkr0td|gdd|_|j|_n|dkrX|jtddd|j |_nn|t kr|jtddd|jtd d d|j |_n0|jtdd d |id |j ||j |_dS)NriFr+rgr,rjrr)z*expected-closing-tag-but-got-right-bracketr*z expected-closing-tag-but-got-eofrX|tkr|jtdd d|j |_n|jtd|dd S NrrXr*rQrqr)rrr9eof-in-script-in-scriptT) rrAr-rBr scriptDataDoubleEscapedDashStater(scriptDataDoubleEscapedLessThanSignStaterrrr'r'r(rs*        z*HTMLTokenizer.scriptDataDoubleEscapedStatecCs|j}|dkr2|jtddd|j|_n|dkrZ|jtddd|j|_n|dkr|jtddd|jtddd|j|_nF|t kr|jtdd d|j |_n|jtd|d|j|_d Sr) rrAr-rBr$scriptDataDoubleEscapedDashDashStaterrrrrrr'r'r(rs.        z.HTMLTokenizer.scriptDataDoubleEscapedDashStatecCs|j}|dkr*|jtdddn|dkrR|jtddd|j|_n|dkrz|jtddd|j|_n|dkr|jtddd|jtdd d|j|_nF|t kr|jtdd d|j |_n|jtd|d|j|_d S) NrrXr*rQrrqr)rrr9rT) rrAr-rBrrrrrrrrr'r'r(r%s2        z2HTMLTokenizer.scriptDataDoubleEscapedDashDashStatecCsP|j}|dkr8|jtdddd|_|j|_n|j||j |_dS)NrrXr*r5T) rrAr-rBrrscriptDataDoubleEscapeEndStaterrHrrr'r'r(r>s   z6HTMLTokenizer.scriptDataDoubleEscapedLessThanSignStatecCs|j}|ttdBkrR|jtd|d|jdkrH|j |_ q|j |_ nB|t kr|jtd|d|j|7_n|j ||j |_ dSr)rrAr rEr-rBrrrrrrr rHrr'r'r(rIs    z,HTMLTokenizer.scriptDataDoubleEscapeEndStatecCs0|j}|tkr$|jtdn|tkrJ|jd|dg|j|_n|dkr\| n|dkrn|j |_n|dkr|j t ddd |jd|dg|j|_n|d kr|j t dd d |jdd dg|j|_nF|t kr|j t dd d |j|_n|jd|dg|j|_dS)NTr,r5rr)'"rVrQr)#invalid-character-in-attribute-namer*rqrrr9z#expected-attribute-name-but-got-eof)rrAr rvr r rBattributeNameStaterrprr-rrrrr'r'r(rYs<           z&HTMLTokenizer.beforeAttributeNameStatecCs|j}d}d}|dkr&|j|_n.|tkr\|jddd||jtd7<d}n|dkrjd}n|tkr||j|_n|dkr|j |_n|d kr|j t d d d |jdddd 7<d}n|dkr |j t d dd |jddd|7<d}nH|t kr6|j t d dd |j|_n|jddd|7<d}|r|jdddt|jddd<|jdddD]>\}}|jddd|kr|j t d dd qҐq|r|dS)NTFrVr,rSrrrrqr)rrr*r9rrrQrzeof-in-attribute-namezduplicate-attribute)rrAbeforeAttributeValueStaterr r rvr afterAttributeNameStaterr-rBrrrrkr rp)r#r,leavingThisState emitTokenrg_r'r'r(rws^             z HTMLTokenizer.attributeNameStatecCsD|j}|tkr$|jtdn|dkr8|j|_n|dkrJ|n|tkrp|jd |dg|j |_n|dkr|j |_n|dkr|j t dd d |jd d dg|j |_n|d kr|j t dd d |jd |dg|j |_nF|tkr$|j t ddd |j|_n|jd |dg|j |_dS)NTrVrr,r5rrqr)rrr*r9rz&invalid-character-after-attribute-namezexpected-end-of-tag-but-got-eof)rrAr rvrrrpr r rBrrr-rrrrr'r'r(rs@            z%HTMLTokenizer.afterAttributeNameStatecCsh|j}|tkr$|jtdn@|dkr8|j|_n,|dkrX|j|_|j|n |dkrj|j|_n|dkr|j t ddd| n|d kr|j t dd d|j d d d d7<|j|_n|dkr|j t ddd|j d d d |7<|j|_nL|tkrB|j t ddd|j|_n"|j d d d |7<|j|_dS)NTrrPrrr)z.expected-attribute-value-but-got-right-bracketr*rqrrr,rSr r9)rVrQ`z"equals-in-unquoted-attribute-valuez$expected-attribute-value-but-got-eof)rrAr rvattributeValueDoubleQuotedStaterattributeValueUnQuotedStaterHattributeValueSingleQuotedStater-rBrrpr rrrr'r'r(rsF             z'HTMLTokenizer.beforeAttributeValueStatecCs|j}|dkr|j|_n|dkr0|dn|dkrj|jtddd|jddd d 7<nN|t kr|jtdd d|j |_n&|jddd ||j d 7<d S)NrrPrqr)rrr*r,rSr r9z#eof-in-attribute-value-double-quote)rrPrqT rrAafterAttributeValueStaterrfr-rBrr rrrvrr'r'r(rs&       z-HTMLTokenizer.attributeValueDoubleQuotedStatecCs|j}|dkr|j|_n|dkr0|dn|dkrj|jtddd|jddd d 7<nN|t kr|jtdd d|j |_n&|jddd ||j d 7<d S)NrrPrqr)rrr*r,rSr r9z#eof-in-attribute-value-single-quote)rrPrqTrrr'r'r(rs&       z-HTMLTokenizer.attributeValueSingleQuotedStatecCs|j}|tkr|j|_n|dkr0|dn|dkrB|n|dkr||jt ddd|j ddd |7<n|d kr|jt dd d|j ddd d 7<nV|t kr|jt dd d|j |_n.|j ddd ||j tdtB7<dS)NrPr)rrrVrQrr)z0unexpected-character-in-unquoted-attribute-valuer*r,rSr rqrrr9z eof-in-attribute-value-no-quotes)rPrrrrVrQrrqT)rrAr rrrfrpr-rBrr rrrvrErr'r'r(rs4         z)HTMLTokenizer.attributeValueUnQuotedStatecCs|j}|tkr|j|_n|dkr.|np|dkr@|j|_n^|tkrt|j t ddd|j ||j |_n*|j t ddd|j ||j|_dS)Nrrr)z$unexpected-EOF-after-attribute-valuer*z*unexpected-character-after-attribute-valueT) rrAr rrrprrr-rBrrHrrr'r'r(r.s&         z&HTMLTokenizer.afterAttributeValueStatecCs|j}|dkr&d|jd<|n^|tkrZ|jtddd|j||j |_ n*|jtddd|j||j |_ dS)NrTrjr)z#unexpected-EOF-after-solidus-in-tagr*z)unexpected-character-after-solidus-in-tag) rrAr rprr-rBrrHrrrrr'r'r(rBs         z&HTMLTokenizer.selfClosingStartTagStatecCsD|jd}|dd}|jtd|d|j|j|_dS)Nrrqr9Commentr*T) rrvreplacer-rBrrArrrr'r'r(rTs    zHTMLTokenizer.bogusCommentStatecCs|jg}|ddkrR||j|ddkrPtddd|_|j|_dSn|ddkrd}dD](}||j|d|krfd }qqf|rtd ddddd |_|j|_dSn|dd krD|jdk rD|jj j rD|jj j dj |jj j krDd}d D].}||j|d|krd }q2q|rD|j |_dS|jtddd|rt|j|qZ|j|_dS)NrSrrr5r*T)dD))oOrMCtTyYpPeEFDoctype)r+rgpublicIdsystemIdcorrect[)rrArrrr)zexpected-dashes-or-doctype)rrArBrr commentStartStater doctypeStatertree openElements namespacedefaultNamespacecdataSectionStater-rHr/r)r#rLmatchedexpectedr'r'r(rcs\       z(HTMLTokenizer.markupDeclarationOpenStatecCs|j}|dkr|j|_n|dkrN|jtddd|jdd7<n|dkr|jtdd d|j|j|j|_nP|t kr|jtdd d|j|j|j|_n|jd|7<|j |_d S) Nrrqr)rrr*r,r9rincorrect-commenteof-in-commentT) rrAcommentStartDashStaterr-rBrr rr commentStaterr'r'r(rs.       zHTMLTokenizer.commentStartStatecCs|j}|dkr|j|_n|dkrN|jtddd|jdd7<n|dkr|jtdd d|j|j|j|_nT|t kr|jtdd d|j|j|j|_n|jdd|7<|j |_d S) Nrrqr)rrr*r,-�rrrT) rrAcommentEndStaterr-rBrr rrrrr'r'r(rs.       z#HTMLTokenizer.commentStartDashStatecCs|j}|dkr|j|_n|dkrN|jtddd|jdd7<nT|tkr|jtddd|j|j|j |_n|jd||j d 7<d S) Nrrqr)rrr*r,r9r)rrqT) rrAcommentEndDashStaterr-rBrr rrrvrr'r'r(rs$       zHTMLTokenizer.commentStatecCs|j}|dkr|j|_n|dkrV|jtddd|jdd7<|j|_nT|t kr|jtddd|j|j|j |_n|jdd|7<|j|_d S) Nrrqr)rrr*r,rzeof-in-comment-end-dashT) rrArrr-rBrr rrrrr'r'r(rs$      z!HTMLTokenizer.commentEndDashStatecCs,|j}|dkr*|j|j|j|_n|dkrd|jtddd|jdd7<|j|_n|dkr|jtdd d|j |_n|d kr|jtdd d|jd|7<nj|t kr|jtdd d|j|j|j|_n4|jtdd d|jdd|7<|j|_dS)Nrrqr)rrr*r,u--�rz,unexpected-bang-after-double-dash-in-commentrz,unexpected-dash-after-double-dash-in-commentzeof-in-comment-double-dashzunexpected-char-in-commentz--T) rrAr-rBr rrrrcommentEndBangStaterrr'r'r(rs@          zHTMLTokenizer.commentEndStatecCs|j}|dkr*|j|j|j|_n|dkrN|jdd7<|j|_n|dkr|jtddd|jdd 7<|j |_nT|t kr|jtdd d|j|j|j|_n|jdd|7<|j |_d S) Nrrr,z--!rqr)rrr*u--!�zeof-in-comment-end-bang-stateT) rrAr-rBr rrrrrrrr'r'r(rs,       z!HTMLTokenizer.commentEndBangStatecCs|j}|tkr|j|_nj|tkr\|jtdddd|j d<|j|j |j |_n*|jtddd|j ||j|_dS)Nr)!expected-doctype-name-but-got-eofr*Frzneed-space-after-doctypeT) rrAr beforeDoctypeNameStaterrr-rBrr rrHrr'r'r(rs        zHTMLTokenizer.doctypeStatecCs|j}|tkrn|dkrT|jtdddd|jd<|j|j|j|_n|dkr|jtdddd |jd <|j |_nR|t kr|jtdd dd|jd<|j|j|j|_n||jd <|j |_d S) Nrr)z+expected-doctype-name-but-got-right-bracketr*Frrqrrr9rgrT) rrAr r-rBrr rrdoctypeNameStaterrr'r'r(r*s4           z$HTMLTokenizer.beforeDoctypeNameStatecCs|j}|tkr2|jdt|jd<|j|_n|dkrh|jdt|jd<|j |j|j |_n|dkr|j t ddd|jdd7<|j |_nh|t kr|j t dddd |jd <|jdt|jd<|j |j|j |_n|jd|7<d S) Nrgrrqr)rrr*r9zeof-in-doctype-nameFrT)rrAr r rkr afterDoctypeNameStaterr-rBrrrrrr'r'r(rDs0        zHTMLTokenizer.doctypeNameStatecCsH|j}|tkrn.|dkr8|j|j|j|_n |tkrd|jd<|j ||jt ddd|j|j|j|_n|dkrd}d D]}|j}||krd}qq|r|j |_dSnD|d kr d}d D]}|j}||krd}qq|r |j |_dS|j ||jt dd d |idd|jd<|j |_dS)NrFrr)eof-in-doctyper*rT))uU)bB)lL)iIrsS)rrrr)mMz*expected-space-or-right-bracket-in-doctyper,r7)rrAr r-rBr rrrrHrafterDoctypePublicKeywordStateafterDoctypeSystemKeywordStatebogusDoctypeState)r#r,rrr'r'r(r]sT            z#HTMLTokenizer.afterDoctypeNameStatecCs|j}|tkr|j|_n|dkrP|jtddd|j||j|_nT|t kr|jtdddd|j d<|j|j |j |_n|j||j|_dS N)rrr)unexpected-char-in-doctyper*rFrT) rrAr "beforeDoctypePublicIdentifierStaterr-rBrrHrr rrr'r'r(rs&         z,HTMLTokenizer.afterDoctypePublicKeywordStatecCs|j}|tkrn|dkr0d|jd<|j|_n|dkrLd|jd<|j|_n|dkr|jt dddd |jd <|j|j|j |_nh|t kr|jt dd dd |jd <|j|j|j |_n(|jt dd dd |jd <|j |_d S)Nrr5rrrr)unexpected-end-of-doctyper*FrrrT) rrAr r (doctypePublicIdentifierDoubleQuotedStater(doctypePublicIdentifierSingleQuotedStater-rBrrrrrr'r'r(rs:             z0HTMLTokenizer.beforeDoctypePublicIdentifierStatecCs|j}|dkr|j|_n|dkrN|jtddd|jdd7<n|dkr|jtdd dd |jd <|j|j|j|_nR|t kr|jtdd dd |jd <|j|j|j|_n|jd|7<d S)Nrrqr)rrr*rr9rrFrrT rrA!afterDoctypePublicIdentifierStaterr-rBrr rrrr'r'r(rs0         z6HTMLTokenizer.doctypePublicIdentifierDoubleQuotedStatecCs|j}|dkr|j|_n|dkrN|jtddd|jdd7<n|dkr|jtdd dd |jd <|j|j|j|_nR|t kr|jtdd dd |jd <|j|j|j|_n|jd|7<d S)Nrrqr)rrr*rr9rrFrrTrrr'r'r(rs0         z6HTMLTokenizer.doctypePublicIdentifierSingleQuotedStatecCs |j}|tkr|j|_n|dkr<|j|j|j|_n|dkrn|jt dddd|jd<|j |_n|dkr|jt dddd|jd<|j |_nh|t kr|jt dd dd |jd <|j|j|j|_n(|jt dddd |jd <|j |_d S) Nrrr)rr*r5rrrFrT)rrAr -betweenDoctypePublicAndSystemIdentifiersStaterr-rBr rr(doctypeSystemIdentifierDoubleQuotedState(doctypeSystemIdentifierSingleQuotedStaterrrr'r'r(r s>              z/HTMLTokenizer.afterDoctypePublicIdentifierStatecCs|j}|tkrn|dkr4|j|j|j|_n|dkrPd|jd<|j|_n|dkrld|jd<|j |_nh|t kr|jt dddd |jd <|j|j|j|_n(|jt dd dd |jd <|j |_d S) Nrrr5rrr)rr*FrrT) rrAr r-rBr rrr r rrrrr'r'r(r s2           z;HTMLTokenizer.betweenDoctypePublicAndSystemIdentifiersStatecCs|j}|tkr|j|_n|dkrP|jtddd|j||j|_nT|t kr|jtdddd|j d<|j|j |j |_n|j||j|_dSr) rrAr "beforeDoctypeSystemIdentifierStaterr-rBrrHrr rrr'r'r(r)s&         z,HTMLTokenizer.afterDoctypeSystemKeywordStatecCs|j}|tkrn|dkr0d|jd<|j|_n|dkrLd|jd<|j|_n|dkr|jt dddd |jd <|j|j|j |_nh|t kr|jt dd dd |jd <|j|j|j |_n(|jt dddd |jd <|j |_d S) Nrr5rrrr)rr*FrrT) rrAr r r rr r-rBrrrrrr'r'r(r =s:             z0HTMLTokenizer.beforeDoctypeSystemIdentifierStatecCs|j}|dkr|j|_n|dkrN|jtddd|jdd7<n|dkr|jtdd dd |jd <|j|j|j|_nR|t kr|jtdd dd |jd <|j|j|j|_n|jd|7<d S)Nrrqr)rrr*rr9rrFrrT rrA!afterDoctypeSystemIdentifierStaterr-rBrr rrrr'r'r(r Zs0         z6HTMLTokenizer.doctypeSystemIdentifierDoubleQuotedStatecCs|j}|dkr|j|_n|dkrN|jtddd|jdd7<n|dkr|jtdd dd |jd <|j|j|j|_nR|t kr|jtdd dd |jd <|j|j|j|_n|jd|7<d S)Nrrqr)rrr*rr9rrFrrTrrr'r'r(r rs0         z6HTMLTokenizer.doctypeSystemIdentifierSingleQuotedStatecCs|j}|tkrn~|dkr4|j|j|j|_n^|tkrt|jt dddd|jd<|j|j|j|_n|jt ddd|j |_dS) Nrr)rr*FrrT) rrAr r-rBr rrrrrrr'r'r(rs$      z/HTMLTokenizer.afterDoctypeSystemIdentifierStatecCsZ|j}|dkr*|j|j|j|_n,|tkrV|j||j|j|j|_ndS)NrT) rrAr-rBr rrrrHrr'r'r(rs    zHTMLTokenizer.bogusDoctypeStatecCsg}||jd||jd|j}|tkr>qq|dksJt|ddddkrv|ddd|d<qq||qd|}|d}|dkrt|D]}|j t d d d q| dd }|r|j t d |d |j |_ dS)N]rrSz]]r5rqrr)rrr*r9rXT)rBrrvrArAssertionErrorrDcountranger-rrrr)r#r,rA nullCountrr'r'r(rs2          zHTMLTokenizer.cdataSectionState)N)NF)N__name__ __module__ __qualname____doc__r"r2rOrerfrprrtr|rzrrrrurrr{rrr~rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr r rr r r rrr __classcell__r'r'r%r(rs H P#         6 "-3rN) __future__rrrZpip._vendor.sixrrF collectionsrrsysr constantsr r r r rrrrrr _inputstreamr_trierrYdictrlobjectrr'r'r'r(s