idD ddlmZddlmZmZddlmZmZddlm Z m Z m Z m Z m Z mZmZmZmZmZmZmZmZmZmZmZGddZGdd eZGd d eZGd d eZGddeZGddeZGddeZGddeZ GddeZ!eddee"dee"de#fdZ$ed d&d!e"d"e%d#e#de%fd$Z&d%S)') lru_cache)ListOptional)COMMON_SAFE_ASCII_CHARACTERSUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuatedis_asciiis_case_variableis_cjk is_emoticon is_hangul is_hiragana is_katakanais_latinis_punctuation is_separator is_symbolis_thaiis_unprintable remove_accent unicode_rangecVeZdZdZdedefdZdeddfdZd dZe de fdZ dS) MessDetectorPluginzy Base abstract class used for mess detection plugins. All detectors MUST extend and implement given methods. characterreturnct)z@ Determine if given character should be fed in. NotImplementedErrorselfrs J/opt/cloudlinux/venv/lib/python3.11/site-packages/charset_normalizer/md.pyeligiblezMessDetectorPlugin.eligible "!Nct)z The main routine to be executed upon character. Insert the logic in witch the text would be considered chaotic. rr s r"feedzMessDetectorPlugin.feed%s "!r%ct)zB Permit to reset the plugin to the initial state. rr!s r"resetzMessDetectorPlugin.reset,r$r%ct)z Compute the chaos ratio based on what your feed() has seen. Must NOT be lower than 0.; No restriction gt 0. rr)s r"ratiozMessDetectorPlugin.ratio2s "!r%rN) __name__ __module__ __qualname____doc__strboolr#r'r*propertyfloatr,r%r"rrs "#"$"""" "c"d"""""""" "u"""X"""r%rcZeZdZd dZdedefdZdeddfdZd dZe de fdZ dS) TooManySymbolOrPunctuationPluginrNcLd|_d|_d|_d|_d|_dS)NrF)_punctuation_count _symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr)s r"__init__z)TooManySymbolOrPunctuationPlugin.__init__<s0'("#%&37!,1###r%rc*|SN isprintabler s r"r#z)TooManySymbolOrPunctuationPlugin.eligibleD$$&&&r%c(|xjdz c_||jkro|tvrft|r|xjdz c_nF|dur0t |r!t|dur|xjdz c_||_dS)NrF) r<r=rrr:isdigitrr r;r s r"r'z%TooManySymbolOrPunctuationPlugin.feedGs " 2 2 2!===i(( (''1,'''!!##u,,i((- **e33""a'""$-!!!r%c0d|_d|_d|_dSNr)r:r<r;r)s r"r*z&TooManySymbolOrPunctuationPlugin.resetYs "# !r%c^|jdkrdS|j|jz|jz }|dkr|ndS)Nrg333333?)r<r:r;)r!ratio_of_punctuations r"r,z&TooManySymbolOrPunctuationPlugin.ratio^sK  A % %3  #d&8 8  !'"(AV'V(=(E(E$$3Nr%r-rMr6r%r"rOrOjs))))###$####)c)d)))) $$$$OuOOOXOOOr%rOcZeZdZd dZdedefdZdeddfdZd dZe de fdZ dS) UnprintablePluginrNc"d|_d|_dSrI)_unprintable_countr<r)s r"r?zUnprintablePlugin.__init__s'(%&r%rcdSNTr6r s r"r#zUnprintablePlugin.eligibletr%cdt|r|xjdz c_|xjdz c_dSrV)rr]r<r s r"r'zUnprintablePlugin.feeds@ ) $ $ )  # #q ( # # "r%cd|_dSrI)r]r)s r"r*zUnprintablePlugin.resets"#r%c@|jdkrdS|jdz|jz S)NrrK)r<r]r)s r"r,zUnprintablePlugin.ratios+  A % %3'!+t/DDDr%r-rMr6r%r"r[r[s''''#$#c#d#### $$$$EuEEEXEEEr%r[cZeZdZd dZdedefdZdeddfdZd dZe de fdZ dS) SuspiciousDuplicateAccentPluginrNc0d|_d|_d|_dSrI_successive_countr<_last_latin_characterr)s r"r?z(SuspiciousDuplicateAccentPlugin.__init__s &'%&48"""r%rcH|ot|SrA)rTrr s r"r#z(SuspiciousDuplicateAccentPlugin.eligibles!  "":x ':'::r%cl|xjdz c_|jt|rt|jrr|r)|jr|xjdz c_t |t |jkr|xjdz c_||_dSrV)r<rjr isupperrirr s r"r'z$SuspiciousDuplicateAccentPlugin.feeds "  & 2y)) 3t9:: 3  "" ,t'A'I'I'K'K ,&&!+&&Y''=9S+T+TTT&&!+&&%."""r%c0d|_d|_d|_dSrIrhr)s r"r*z%SuspiciousDuplicateAccentPlugin.resets !" !%)"""r%c@|jdkrdS|jdz|jz S)NrrKrF)r<rir)s r"r,z%SuspiciousDuplicateAccentPlugin.ratios+  A % %3&*d.CCCr%r-rMr6r%r"rfrfs9999 ;#;$;;;; /c /d / / / /**** DuDDDXDDDr%rfcZeZdZd dZdedefdZdeddfdZd dZe de fdZ dS) SuspiciousRangerNc0d|_d|_d|_dSrI)"_suspicious_successive_range_countr<_last_printable_seenr)s r"r?zSuspiciousRange.__init__s 78/%&37!!!r%rc*|SrArBr s r"r#zSuspiciousRange.eligiblerDr%cD|xjdz c_|st|s |tvr d|_dS|j ||_dSt |j}t |}t ||r|xjdz c_||_dSrV)r<isspacerrrtr is_suspiciously_successive_rangers)r!runicode_range_aunicode_range_bs r"r'zSuspiciousRange.feeds "      i(( 888(,D % F  $ ,(1D % F)6t7P)Q)Q)6y)A)A +O_ M M 9  3 3q 8 3 3$-!!!r%c0d|_d|_d|_dSrI)r<rsrtr)s r"r*zSuspiciousRange.resets !23/$(!!!r%cT|jdkrdS|jdz|jz }|dkrdS|S)NrrKrFg?)r<rs)r!ratio_of_suspicious_range_usages r"r,zSuspiciousRange.ratiosH  A % %3  3a 7  !2"' +S 0 03..r%r-rMr6r%r"rqrqs8888 '#'$''''.c.d.....))))  /u / / /X / / /r%rqcZeZdZd dZdedefdZdeddfdZd dZe de fdZ dS) SuperWeirdWordPluginrNcd|_d|_d|_d|_d|_d|_d|_d|_d|_dS)NrF) _word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr<_bad_character_count_buffer_buffer_accent_countr)s r"r?zSuperWeirdWordPlugin.__init__sO !$%() */!). %&)*! )*!!!r%rcdSr_r6r s r"r#zSuperWeirdWordPlugin.eligibler`r%c|r|xj|z c_t|r|xjdz c_|jdur|t |dust|r\t |durKt|dur:t|dur)t|durt|durd|_dS|jsdS| st|st|r"|jr|xjdz c_t|j}|xj|z c_|dkre|j|z dkrd|_t|jdr6|jdr|xjdz c_d|_|dkr|jr|xjdz c_d|_|jr9|xjdz c_|xjt|jz c_d|_d|_d|_d |_dS|d vr>|dur*t/|rd|_|xj|z c_dSdSdSdS) NrFTg(\?rr>_-<=>|~)rTrr rrrr rrrrrwrrrlenr<rrmrrrrGr)r!r buffer_lengths r"r'zSuperWeirdWordPlugin.feeds       LLI %LLi(( /))Q.))(E11i((E11^I5N5N19%%..i((E11 **e33 **e33I&&%//+/( F|  F     " )#<#<" &@LY@W@W" &l" &    !  !$T\!2!2M  ! !] 2 ! !!!,}->>)),1)',D $DL()D % % % @ @ @!!##u,,)$$-)-D % LLI %LLLL A @,,,,r%cvd|_d|_d|_d|_d|_d|_d|_d|_dS)NrFr)rrrrrr<rrr)s r"r*zSuperWeirdWordPlugin.reset=sG $)!#(   !$%!#$   r%cP|jdkr |jdkrdS|j|jz S)N rrK)rrrr<r)s r"r,zSuperWeirdWordPlugin.ratioGs3  r ! !d&>!&C&C3(4+@@@r%r-rMr6r%r"rrs + + + +#$4&c4&d4&4&4&4&l%%%%AuAAAXAAAr%rc^eZdZdZd dZdedefdZdeddfdZd dZ e de fd Z dS) CjkInvalidStopPluginu GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and can be easily detected. Searching for the overuse of '丅' and '丄'. rNc"d|_d|_dSrI_wrong_stop_count_cjk_character_countr)s r"r?zCjkInvalidStopPlugin.__init__Us&')*!!!r%rcdSr_r6r s r"r#zCjkInvalidStopPlugin.eligibleYr`r%ct|dvr|xjdz c_dSt|r|xjdz c_dSdS)N>丄丅r)rr rr s r"r'zCjkInvalidStopPlugin.feed\sZ  & &  " "a ' " " F )   +  % % * % % % % + +r%c"d|_d|_dSrIrr)s r"r*zCjkInvalidStopPlugin.resetcs!"$%!!!r%c:|jdkrdS|j|jz S)NrK)rrr)s r"r,zCjkInvalidStopPlugin.ratiogs&  $r ) )3%(AAAr%r-) r.r/r0r1r?r2r3r#r'r*r4r5r,r6r%r"rrOs ++++#$+c+d++++&&&&BuBBBXBBBr%rcZeZdZd dZdedefdZdeddfdZd dZe de fdZ dS) ArchaicUpperLowerPluginrNchd|_d|_d|_d|_d|_d|_d|_dS)NFrT)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr<_last_alpha_seen_current_ascii_onlyr)s r"r?z ArchaicUpperLowerPlugin.__init__os? 45,23*890%&/3)-   r%rcdSr_r6r s r"r#z ArchaicUpperLowerPlugin.eligible|r`r%c|ot|}|du}|r|jdkrt|jdkr4|dur|jdur|xj|jz c_d|_d|_d|_d|_|xj dz c_ d|_dS|jdurt|durd|_|j| r|j s-| rB|j r)|jdur|xjdz c_d|_nd|_nd|_|xj dz c_ |xjdz c_||_dS)NFr@rTrF) rTr rrGrrrrrr<r rmislower)r!r is_concerned chunk_seps r"r'zArchaicUpperLowerPlugin.feeds ((**J/? /J/J  E)  =AA4::%%''500,558868823D .34D 0$(D !DI  ! !Q & ! !'+D $ F  #t + +0C0Cu0L0L',D $  ,!!## "(=(E(E(G(G "!!## "(,(=(E(E(G(G "9$$66!;66 %DII $DII!  " ,,1,, )r%chd|_d|_d|_d|_d|_d|_d|_dS)NrFT)r<rrrrrrr)s r"r*zArchaicUpperLowerPlugin.resets? !/0,-.*340 $ #'   r%c:|jdkrdS|j|jz S)NrrK)r<rr)s r"r,zArchaicUpperLowerPlugin.ratios&  A % %37$:OOOr%r-rMr6r%r"rrns . . . .#$(*c(*d(*(*(*(*T((((PuPPPXPPPr%r)maxsizeryrzrc||dS||krdSd|vrd|vrdSd|vsd|vrdSd|vsd|vr d|vsd|vrdS|d|d}}|D]}|tvr ||vrdS|dv|dv}}|s|r d |vsd |vrdS|r|rdSd |vsd |vrd |vsd |vrdS|d ks|d krdSd |vs d |vs|d vr|d vrd |vsd |vrdSd|vsd|vrdSdS)za Determine if two Unicode range seen next to each other can be considered as suspicious. NTFLatin Emoticons Combining )HiraganaKatakanaCJKHangulz Basic Latin)rr PunctuationForms)splitr)ryrzkeywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charss r"rxrxs/"9t/))u/!!g&@&@uo%%)G)Gu ?""g&@&@&&+*H*Hu)8)>)> **S!!' 0 0 0  ! ! !55 "   33 ' ,   E_$<$<u,u?""h/&A&A O # #u'?'?5 m + +-/O/O5   E_$<$<333 7 7 7 O + +}/O/O5 o % %O)C)C5 4r%i皙?Fdecoded_sequencemaximum_thresholddebugcdtD}t|dz}d}|dkrd}n |dkrd}nd}t|d zt |D]m\}}|D],} | |r| |-|d kr ||zd ks ||dz kr!td |D}||krnn|r|D]} t| j | j t|d S) zw Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier. c"g|] }| Sr6r6).0md_classs r" zmess_ratio.. s++++ +++r%rrKi rr rc3$K|] }|jV dSrA)r,)rdts r" zmess_ratio.. s$!?!?r"(!?!?!?!?!?!?r%) r__subclasses__rzipranger#r'sumprint __class__r,round) rrr detectorslengthmean_mess_ratio!intermediary_mean_mess_ratio_calcrindexdetectorrs r" mess_ratiorsh++#5#D#D#F#F+++I&''!+F O ||13)) 4,.)),/) 04 7vGG   5! ) )H  ++ ) i((( AII%"CCqHH fqj !!?!?Y!?!?!???O"333 * * *B ", ) ) ) ) ! $ $$r%N)rF)' functoolsrtypingrrconstantrrutilsr r r r r rrrrrrrrrrrrr8rOr[rfrqrrrr2r3rxr5rr6r%r"rs!!!!!!!!SSSSSSSS(""""""""D,L,L,L,L,L'9,L,L,L^OOOOO1OOO4EEEEE*EEE0"D"D"D"D"D&8"D"D"DJ1/1/1/1/1/(1/1/1/hWAWAWAWAWA-WAWAWAtBBBBB-BBB>IPIPIPIPIP0IPIPIPX 4Cc]C5=c]C CCCCL 4IN'%'%'%.3'%BF'% '%'%'%'%'%'%r%