diff --git a/libs/guessit/ISO-3166-1_utf8.txt b/libs/guessit/ISO-3166-1_utf8.txt deleted file mode 100644 index 7022040d..00000000 --- a/libs/guessit/ISO-3166-1_utf8.txt +++ /dev/null @@ -1,249 +0,0 @@ -Afghanistan|AF|AFG|004|ISO 3166-2:AF -Åland Islands|AX|ALA|248|ISO 3166-2:AX -Albania|AL|ALB|008|ISO 3166-2:AL -Algeria|DZ|DZA|012|ISO 3166-2:DZ -American Samoa|AS|ASM|016|ISO 3166-2:AS -Andorra|AD|AND|020|ISO 3166-2:AD -Angola|AO|AGO|024|ISO 3166-2:AO -Anguilla|AI|AIA|660|ISO 3166-2:AI -Antarctica|AQ|ATA|010|ISO 3166-2:AQ -Antigua and Barbuda|AG|ATG|028|ISO 3166-2:AG -Argentina|AR|ARG|032|ISO 3166-2:AR -Armenia|AM|ARM|051|ISO 3166-2:AM -Aruba|AW|ABW|533|ISO 3166-2:AW -Australia|AU|AUS|036|ISO 3166-2:AU -Austria|AT|AUT|040|ISO 3166-2:AT -Azerbaijan|AZ|AZE|031|ISO 3166-2:AZ -Bahamas|BS|BHS|044|ISO 3166-2:BS -Bahrain|BH|BHR|048|ISO 3166-2:BH -Bangladesh|BD|BGD|050|ISO 3166-2:BD -Barbados|BB|BRB|052|ISO 3166-2:BB -Belarus|BY|BLR|112|ISO 3166-2:BY -Belgium|BE|BEL|056|ISO 3166-2:BE -Belize|BZ|BLZ|084|ISO 3166-2:BZ -Benin|BJ|BEN|204|ISO 3166-2:BJ -Bermuda|BM|BMU|060|ISO 3166-2:BM -Bhutan|BT|BTN|064|ISO 3166-2:BT -Bolivia, Plurinational State of|BO|BOL|068|ISO 3166-2:BO -Bonaire, Sint Eustatius and Saba|BQ|BES|535|ISO 3166-2:BQ -Bosnia and Herzegovina|BA|BIH|070|ISO 3166-2:BA -Botswana|BW|BWA|072|ISO 3166-2:BW -Bouvet Island|BV|BVT|074|ISO 3166-2:BV -Brazil|BR|BRA|076|ISO 3166-2:BR -British Indian Ocean Territory|IO|IOT|086|ISO 3166-2:IO -Brunei Darussalam|BN|BRN|096|ISO 3166-2:BN -Bulgaria|BG|BGR|100|ISO 3166-2:BG -Burkina Faso|BF|BFA|854|ISO 3166-2:BF -Burundi|BI|BDI|108|ISO 3166-2:BI -Cambodia|KH|KHM|116|ISO 3166-2:KH -Cameroon|CM|CMR|120|ISO 3166-2:CM -Canada|CA|CAN|124|ISO 3166-2:CA -Cape Verde|CV|CPV|132|ISO 3166-2:CV -Cayman Islands|KY|CYM|136|ISO 3166-2:KY -Central African Republic|CF|CAF|140|ISO 3166-2:CF -Chad|TD|TCD|148|ISO 3166-2:TD -Chile|CL|CHL|152|ISO 3166-2:CL -China|CN|CHN|156|ISO 3166-2:CN -Christmas Island|CX|CXR|162|ISO 3166-2:CX -Cocos (Keeling) 
Islands|CC|CCK|166|ISO 3166-2:CC -Colombia|CO|COL|170|ISO 3166-2:CO -Comoros|KM|COM|174|ISO 3166-2:KM -Congo|CG|COG|178|ISO 3166-2:CG -Congo, the Democratic Republic of the|CD|COD|180|ISO 3166-2:CD -Cook Islands|CK|COK|184|ISO 3166-2:CK -Costa Rica|CR|CRI|188|ISO 3166-2:CR -Côte d'Ivoire|CI|CIV|384|ISO 3166-2:CI -Croatia|HR|HRV|191|ISO 3166-2:HR -Cuba|CU|CUB|192|ISO 3166-2:CU -Curaçao|CW|CUW|531|ISO 3166-2:CW -Cyprus|CY|CYP|196|ISO 3166-2:CY -Czech Republic|CZ|CZE|203|ISO 3166-2:CZ -Denmark|DK|DNK|208|ISO 3166-2:DK -Djibouti|DJ|DJI|262|ISO 3166-2:DJ -Dominica|DM|DMA|212|ISO 3166-2:DM -Dominican Republic|DO|DOM|214|ISO 3166-2:DO -Ecuador|EC|ECU|218|ISO 3166-2:EC -Egypt|EG|EGY|818|ISO 3166-2:EG -El Salvador|SV|SLV|222|ISO 3166-2:SV -Equatorial Guinea|GQ|GNQ|226|ISO 3166-2:GQ -Eritrea|ER|ERI|232|ISO 3166-2:ER -Estonia|EE|EST|233|ISO 3166-2:EE -Ethiopia|ET|ETH|231|ISO 3166-2:ET -Falkland Islands (Malvinas|FK|FLK|238|ISO 3166-2:FK -Faroe Islands|FO|FRO|234|ISO 3166-2:FO -Fiji|FJ|FJI|242|ISO 3166-2:FJ -Finland|FI|FIN|246|ISO 3166-2:FI -France|FR|FRA|250|ISO 3166-2:FR -French Guiana|GF|GUF|254|ISO 3166-2:GF -French Polynesia|PF|PYF|258|ISO 3166-2:PF -French Southern Territories|TF|ATF|260|ISO 3166-2:TF -Gabon|GA|GAB|266|ISO 3166-2:GA -Gambia|GM|GMB|270|ISO 3166-2:GM -Georgia|GE|GEO|268|ISO 3166-2:GE -Germany|DE|DEU|276|ISO 3166-2:DE -Ghana|GH|GHA|288|ISO 3166-2:GH -Gibraltar|GI|GIB|292|ISO 3166-2:GI -Greece|GR|GRC|300|ISO 3166-2:GR -Greenland|GL|GRL|304|ISO 3166-2:GL -Grenada|GD|GRD|308|ISO 3166-2:GD -Guadeloupe|GP|GLP|312|ISO 3166-2:GP -Guam|GU|GUM|316|ISO 3166-2:GU -Guatemala|GT|GTM|320|ISO 3166-2:GT -Guernsey|GG|GGY|831|ISO 3166-2:GG -Guinea|GN|GIN|324|ISO 3166-2:GN -Guinea-Bissau|GW|GNB|624|ISO 3166-2:GW -Guyana|GY|GUY|328|ISO 3166-2:GY -Haiti|HT|HTI|332|ISO 3166-2:HT -Heard Island and McDonald Islands|HM|HMD|334|ISO 3166-2:HM -Holy See (Vatican City State|VA|VAT|336|ISO 3166-2:VA -Honduras|HN|HND|340|ISO 3166-2:HN -Hong Kong|HK|HKG|344|ISO 3166-2:HK 
-Hungary|HU|HUN|348|ISO 3166-2:HU -Iceland|IS|ISL|352|ISO 3166-2:IS -India|IN|IND|356|ISO 3166-2:IN -Indonesia|ID|IDN|360|ISO 3166-2:ID -Iran, Islamic Republic of|IR|IRN|364|ISO 3166-2:IR -Iraq|IQ|IRQ|368|ISO 3166-2:IQ -Ireland|IE|IRL|372|ISO 3166-2:IE -Isle of Man|IM|IMN|833|ISO 3166-2:IM -Israel|IL|ISR|376|ISO 3166-2:IL -Italy|IT|ITA|380|ISO 3166-2:IT -Jamaica|JM|JAM|388|ISO 3166-2:JM -Japan|JP|JPN|392|ISO 3166-2:JP -Jersey|JE|JEY|832|ISO 3166-2:JE -Jordan|JO|JOR|400|ISO 3166-2:JO -Kazakhstan|KZ|KAZ|398|ISO 3166-2:KZ -Kenya|KE|KEN|404|ISO 3166-2:KE -Kiribati|KI|KIR|296|ISO 3166-2:KI -Korea, Democratic People's Republic of|KP|PRK|408|ISO 3166-2:KP -Korea, Republic of|KR|KOR|410|ISO 3166-2:KR -Kuwait|KW|KWT|414|ISO 3166-2:KW -Kyrgyzstan|KG|KGZ|417|ISO 3166-2:KG -Lao People's Democratic Republic|LA|LAO|418|ISO 3166-2:LA -Latvia|LV|LVA|428|ISO 3166-2:LV -Lebanon|LB|LBN|422|ISO 3166-2:LB -Lesotho|LS|LSO|426|ISO 3166-2:LS -Liberia|LR|LBR|430|ISO 3166-2:LR -Libya|LY|LBY|434|ISO 3166-2:LY -Liechtenstein|LI|LIE|438|ISO 3166-2:LI -Lithuania|LT|LTU|440|ISO 3166-2:LT -Luxembourg|LU|LUX|442|ISO 3166-2:LU -Macao|MO|MAC|446|ISO 3166-2:MO -Macedonia, the former Yugoslav Republic of|MK|MKD|807|ISO 3166-2:MK -Madagascar|MG|MDG|450|ISO 3166-2:MG -Malawi|MW|MWI|454|ISO 3166-2:MW -Malaysia|MY|MYS|458|ISO 3166-2:MY -Maldives|MV|MDV|462|ISO 3166-2:MV -Mali|ML|MLI|466|ISO 3166-2:ML -Malta|MT|MLT|470|ISO 3166-2:MT -Marshall Islands|MH|MHL|584|ISO 3166-2:MH -Martinique|MQ|MTQ|474|ISO 3166-2:MQ -Mauritania|MR|MRT|478|ISO 3166-2:MR -Mauritius|MU|MUS|480|ISO 3166-2:MU -Mayotte|YT|MYT|175|ISO 3166-2:YT -Mexico|MX|MEX|484|ISO 3166-2:MX -Micronesia, Federated States of|FM|FSM|583|ISO 3166-2:FM -Moldova, Republic of|MD|MDA|498|ISO 3166-2:MD -Monaco|MC|MCO|492|ISO 3166-2:MC -Mongolia|MN|MNG|496|ISO 3166-2:MN -Montenegro|ME|MNE|499|ISO 3166-2:ME -Montserrat|MS|MSR|500|ISO 3166-2:MS -Morocco|MA|MAR|504|ISO 3166-2:MA -Mozambique|MZ|MOZ|508|ISO 3166-2:MZ -Myanmar|MM|MMR|104|ISO 3166-2:MM 
-Namibia|NA|NAM|516|ISO 3166-2:NA -Nauru|NR|NRU|520|ISO 3166-2:NR -Nepal|NP|NPL|524|ISO 3166-2:NP -Netherlands|NL|NLD|528|ISO 3166-2:NL -New Caledonia|NC|NCL|540|ISO 3166-2:NC -New Zealand|NZ|NZL|554|ISO 3166-2:NZ -Nicaragua|NI|NIC|558|ISO 3166-2:NI -Niger|NE|NER|562|ISO 3166-2:NE -Nigeria|NG|NGA|566|ISO 3166-2:NG -Niue|NU|NIU|570|ISO 3166-2:NU -Norfolk Island|NF|NFK|574|ISO 3166-2:NF -Northern Mariana Islands|MP|MNP|580|ISO 3166-2:MP -Norway|NO|NOR|578|ISO 3166-2:NO -Oman|OM|OMN|512|ISO 3166-2:OM -Pakistan|PK|PAK|586|ISO 3166-2:PK -Palau|PW|PLW|585|ISO 3166-2:PW -Palestinian Territory, Occupied|PS|PSE|275|ISO 3166-2:PS -Panama|PA|PAN|591|ISO 3166-2:PA -Papua New Guinea|PG|PNG|598|ISO 3166-2:PG -Paraguay|PY|PRY|600|ISO 3166-2:PY -Peru|PE|PER|604|ISO 3166-2:PE -Philippines|PH|PHL|608|ISO 3166-2:PH -Pitcairn|PN|PCN|612|ISO 3166-2:PN -Poland|PL|POL|616|ISO 3166-2:PL -Portugal|PT|PRT|620|ISO 3166-2:PT -Puerto Rico|PR|PRI|630|ISO 3166-2:PR -Qatar|QA|QAT|634|ISO 3166-2:QA -Réunion|RE|REU|638|ISO 3166-2:RE -Romania|RO|ROU|642|ISO 3166-2:RO -Russian Federation|RU|RUS|643|ISO 3166-2:RU -Rwanda|RW|RWA|646|ISO 3166-2:RW -Saint Barthélemy|BL|BLM|652|ISO 3166-2:BL -Saint Helena, Ascension and Tristan da Cunha|SH|SHN|654|ISO 3166-2:SH -Saint Kitts and Nevis|KN|KNA|659|ISO 3166-2:KN -Saint Lucia|LC|LCA|662|ISO 3166-2:LC -Saint Martin (French part|MF|MAF|663|ISO 3166-2:MF -Saint Pierre and Miquelon|PM|SPM|666|ISO 3166-2:PM -Saint Vincent and the Grenadines|VC|VCT|670|ISO 3166-2:VC -Samoa|WS|WSM|882|ISO 3166-2:WS -San Marino|SM|SMR|674|ISO 3166-2:SM -Sao Tome and Principe|ST|STP|678|ISO 3166-2:ST -Saudi Arabia|SA|SAU|682|ISO 3166-2:SA -Senegal|SN|SEN|686|ISO 3166-2:SN -Serbia|RS|SRB|688|ISO 3166-2:RS -Seychelles|SC|SYC|690|ISO 3166-2:SC -Sierra Leone|SL|SLE|694|ISO 3166-2:SL -Singapore|SG|SGP|702|ISO 3166-2:SG -Sint Maarten (Dutch part|SX|SXM|534|ISO 3166-2:SX -Slovakia|SK|SVK|703|ISO 3166-2:SK -Slovenia|SI|SVN|705|ISO 3166-2:SI -Solomon Islands|SB|SLB|090|ISO 3166-2:SB 
-Somalia|SO|SOM|706|ISO 3166-2:SO -South Africa|ZA|ZAF|710|ISO 3166-2:ZA -South Georgia and the South Sandwich Islands|GS|SGS|239|ISO 3166-2:GS -South Sudan|SS|SSD|728|ISO 3166-2:SS -Spain|ES|ESP|724|ISO 3166-2:ES -Sri Lanka|LK|LKA|144|ISO 3166-2:LK -Sudan|SD|SDN|729|ISO 3166-2:SD -Suriname|SR|SUR|740|ISO 3166-2:SR -Svalbard and Jan Mayen|SJ|SJM|744|ISO 3166-2:SJ -Swaziland|SZ|SWZ|748|ISO 3166-2:SZ -Sweden|SE|SWE|752|ISO 3166-2:SE -Switzerland|CH|CHE|756|ISO 3166-2:CH -Syrian Arab Republic|SY|SYR|760|ISO 3166-2:SY -Taiwan, Province of China|TW|TWN|158|ISO 3166-2:TW -Tajikistan|TJ|TJK|762|ISO 3166-2:TJ -Tanzania, United Republic of|TZ|TZA|834|ISO 3166-2:TZ -Thailand|TH|THA|764|ISO 3166-2:TH -Timor-Leste|TL|TLS|626|ISO 3166-2:TL -Togo|TG|TGO|768|ISO 3166-2:TG -Tokelau|TK|TKL|772|ISO 3166-2:TK -Tonga|TO|TON|776|ISO 3166-2:TO -Trinidad and Tobago|TT|TTO|780|ISO 3166-2:TT -Tunisia|TN|TUN|788|ISO 3166-2:TN -Turkey|TR|TUR|792|ISO 3166-2:TR -Turkmenistan|TM|TKM|795|ISO 3166-2:TM -Turks and Caicos Islands|TC|TCA|796|ISO 3166-2:TC -Tuvalu|TV|TUV|798|ISO 3166-2:TV -Uganda|UG|UGA|800|ISO 3166-2:UG -Ukraine|UA|UKR|804|ISO 3166-2:UA -United Arab Emirates|AE|ARE|784|ISO 3166-2:AE -United Kingdom|GB|GBR|826|ISO 3166-2:GB -United States|US|USA|840|ISO 3166-2:US -United States Minor Outlying Islands|UM|UMI|581|ISO 3166-2:UM -Uruguay|UY|URY|858|ISO 3166-2:UY -Uzbekistan|UZ|UZB|860|ISO 3166-2:UZ -Vanuatu|VU|VUT|548|ISO 3166-2:VU -Venezuela, Bolivarian Republic of|VE|VEN|862|ISO 3166-2:VE -Viet Nam|VN|VNM|704|ISO 3166-2:VN -Virgin Islands, British|VG|VGB|092|ISO 3166-2:VG -Virgin Islands, U.S|VI|VIR|850|ISO 3166-2:VI -Wallis and Futuna|WF|WLF|876|ISO 3166-2:WF -Western Sahara|EH|ESH|732|ISO 3166-2:EH -Yemen|YE|YEM|887|ISO 3166-2:YE -Zambia|ZM|ZMB|894|ISO 3166-2:ZM -Zimbabwe|ZW|ZWE|716|ISO 3166-2:ZW diff --git a/libs/guessit/ISO-639-2_utf-8.txt b/libs/guessit/ISO-639-2_utf-8.txt deleted file mode 100644 index 2961d219..00000000 --- a/libs/guessit/ISO-639-2_utf-8.txt +++ /dev/null @@ 
-1,485 +0,0 @@ -aar||aa|Afar|afar -abk||ab|Abkhazian|abkhaze -ace|||Achinese|aceh -ach|||Acoli|acoli -ada|||Adangme|adangme -ady|||Adyghe; Adygei|adyghé -afa|||Afro-Asiatic languages|afro-asiatiques, langues -afh|||Afrihili|afrihili -afr||af|Afrikaans|afrikaans -ain|||Ainu|aïnou -aka||ak|Akan|akan -akk|||Akkadian|akkadien -alb|sqi|sq|Albanian|albanais -ale|||Aleut|aléoute -alg|||Algonquian languages|algonquines, langues -alt|||Southern Altai|altai du Sud -amh||am|Amharic|amharique -ang|||English, Old (ca.450-1100)|anglo-saxon (ca.450-1100) -anp|||Angika|angika -apa|||Apache languages|apaches, langues -ara||ar|Arabic|arabe -arc|||Official Aramaic (700-300 BCE); Imperial Aramaic (700-300 BCE)|araméen d'empire (700-300 BCE) -arg||an|Aragonese|aragonais -arm|hye|hy|Armenian|arménien -arn|||Mapudungun; Mapuche|mapudungun; mapuche; mapuce -arp|||Arapaho|arapaho -art|||Artificial languages|artificielles, langues -arw|||Arawak|arawak -asm||as|Assamese|assamais -ast|||Asturian; Bable; Leonese; Asturleonese|asturien; bable; léonais; asturoléonais -ath|||Athapascan languages|athapascanes, langues -aus|||Australian languages|australiennes, langues -ava||av|Avaric|avar -ave||ae|Avestan|avestique -awa|||Awadhi|awadhi -aym||ay|Aymara|aymara -aze||az|Azerbaijani|azéri -bad|||Banda languages|banda, langues -bai|||Bamileke languages|bamiléké, langues -bak||ba|Bashkir|bachkir -bal|||Baluchi|baloutchi -bam||bm|Bambara|bambara -ban|||Balinese|balinais -baq|eus|eu|Basque|basque -bas|||Basa|basa -bat|||Baltic languages|baltes, langues -bej|||Beja; Bedawiyet|bedja -bel||be|Belarusian|biélorusse -bem|||Bemba|bemba -ben||bn|Bengali|bengali -ber|||Berber languages|berbères, langues -bho|||Bhojpuri|bhojpuri -bih||bh|Bihari languages|langues biharis -bik|||Bikol|bikol -bin|||Bini; Edo|bini; edo -bis||bi|Bislama|bichlamar -bla|||Siksika|blackfoot -bnt|||Bantu (Other)|bantoues, autres langues -bos||bs|Bosnian|bosniaque -bra|||Braj|braj -bre||br|Breton|breton -btk|||Batak languages|batak, langues 
-bua|||Buriat|bouriate -bug|||Buginese|bugi -bul||bg|Bulgarian|bulgare -bur|mya|my|Burmese|birman -byn|||Blin; Bilin|blin; bilen -cad|||Caddo|caddo -cai|||Central American Indian languages|amérindiennes de L'Amérique centrale, langues -car|||Galibi Carib|karib; galibi; carib -cat||ca|Catalan; Valencian|catalan; valencien -cau|||Caucasian languages|caucasiennes, langues -ceb|||Cebuano|cebuano -cel|||Celtic languages|celtiques, langues; celtes, langues -cha||ch|Chamorro|chamorro -chb|||Chibcha|chibcha -che||ce|Chechen|tchétchène -chg|||Chagatai|djaghataï -chi|zho|zh|Chinese|chinois -chk|||Chuukese|chuuk -chm|||Mari|mari -chn|||Chinook jargon|chinook, jargon -cho|||Choctaw|choctaw -chp|||Chipewyan; Dene Suline|chipewyan -chr|||Cherokee|cherokee -chu||cu|Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic|slavon d'église; vieux slave; slavon liturgique; vieux bulgare -chv||cv|Chuvash|tchouvache -chy|||Cheyenne|cheyenne -cmc|||Chamic languages|chames, langues -cop|||Coptic|copte -cor||kw|Cornish|cornique -cos||co|Corsican|corse -cpe|||Creoles and pidgins, English based|créoles et pidgins basés sur l'anglais -cpf|||Creoles and pidgins, French-based |créoles et pidgins basés sur le français -cpp|||Creoles and pidgins, Portuguese-based |créoles et pidgins basés sur le portugais -cre||cr|Cree|cree -crh|||Crimean Tatar; Crimean Turkish|tatar de Crimé -crp|||Creoles and pidgins |créoles et pidgins -csb|||Kashubian|kachoube -cus|||Cushitic languages|couchitiques, langues -cze|ces|cs|Czech|tchèque -dak|||Dakota|dakota -dan||da|Danish|danois -dar|||Dargwa|dargwa -day|||Land Dayak languages|dayak, langues -del|||Delaware|delaware -den|||Slave (Athapascan)|esclave (athapascan) -dgr|||Dogrib|dogrib -din|||Dinka|dinka -div||dv|Divehi; Dhivehi; Maldivian|maldivien -doi|||Dogri|dogri -dra|||Dravidian languages|dravidiennes, langues -dsb|||Lower Sorbian|bas-sorabe -dua|||Duala|douala -dum|||Dutch, Middle (ca.1050-1350)|néerlandais moyen (ca. 
1050-1350) -dut|nld|nl|Dutch; Flemish|néerlandais; flamand -dyu|||Dyula|dioula -dzo||dz|Dzongkha|dzongkha -efi|||Efik|efik -egy|||Egyptian (Ancient)|égyptien -eka|||Ekajuk|ekajuk -elx|||Elamite|élamite -eng||en|English|anglais -enm|||English, Middle (1100-1500)|anglais moyen (1100-1500) -epo||eo|Esperanto|espéranto -est||et|Estonian|estonien -ewe||ee|Ewe|éwé -ewo|||Ewondo|éwondo -fan|||Fang|fang -fao||fo|Faroese|féroïen -fat|||Fanti|fanti -fij||fj|Fijian|fidjien -fil|||Filipino; Pilipino|filipino; pilipino -fin||fi|Finnish|finnois -fiu|||Finno-Ugrian languages|finno-ougriennes, langues -fon|||Fon|fon -fre|fra|fr|French|français -frm|||French, Middle (ca.1400-1600)|français moyen (1400-1600) -fro|||French, Old (842-ca.1400)|français ancien (842-ca.1400) -frr|||Northern Frisian|frison septentrional -frs|||Eastern Frisian|frison oriental -fry||fy|Western Frisian|frison occidental -ful||ff|Fulah|peul -fur|||Friulian|frioulan -gaa|||Ga|ga -gay|||Gayo|gayo -gba|||Gbaya|gbaya -gem|||Germanic languages|germaniques, langues -geo|kat|ka|Georgian|géorgien -ger|deu|de|German|allemand -gez|||Geez|guèze -gil|||Gilbertese|kiribati -gla||gd|Gaelic; Scottish Gaelic|gaélique; gaélique écossais -gle||ga|Irish|irlandais -glg||gl|Galician|galicien -glv||gv|Manx|manx; mannois -gmh|||German, Middle High (ca.1050-1500)|allemand, moyen haut (ca. 1050-1500) -goh|||German, Old High (ca.750-1050)|allemand, vieux haut (ca. 
750-1050) -gon|||Gondi|gond -gor|||Gorontalo|gorontalo -got|||Gothic|gothique -grb|||Grebo|grebo -grc|||Greek, Ancient (to 1453)|grec ancien (jusqu'à 1453) -gre|ell|el|Greek, Modern (1453-)|grec moderne (après 1453) -grn||gn|Guarani|guarani -gsw|||Swiss German; Alemannic; Alsatian|suisse alémanique; alémanique; alsacien -guj||gu|Gujarati|goudjrati -gwi|||Gwich'in|gwich'in -hai|||Haida|haida -hat||ht|Haitian; Haitian Creole|haïtien; créole haïtien -hau||ha|Hausa|haoussa -haw|||Hawaiian|hawaïen -heb||he|Hebrew|hébreu -her||hz|Herero|herero -hil|||Hiligaynon|hiligaynon -him|||Himachali languages; Western Pahari languages|langues himachalis; langues paharis occidentales -hin||hi|Hindi|hindi -hit|||Hittite|hittite -hmn|||Hmong; Mong|hmong -hmo||ho|Hiri Motu|hiri motu -hrv||hr|Croatian|croate -hsb|||Upper Sorbian|haut-sorabe -hun||hu|Hungarian|hongrois -hup|||Hupa|hupa -iba|||Iban|iban -ibo||ig|Igbo|igbo -ice|isl|is|Icelandic|islandais -ido||io|Ido|ido -iii||ii|Sichuan Yi; Nuosu|yi de Sichuan -ijo|||Ijo languages|ijo, langues -iku||iu|Inuktitut|inuktitut -ile||ie|Interlingue; Occidental|interlingue -ilo|||Iloko|ilocano -ina||ia|Interlingua (International Auxiliary Language Association)|interlingua (langue auxiliaire internationale) -inc|||Indic languages|indo-aryennes, langues -ind||id|Indonesian|indonésien -ine|||Indo-European languages|indo-européennes, langues -inh|||Ingush|ingouche -ipk||ik|Inupiaq|inupiaq -ira|||Iranian languages|iraniennes, langues -iro|||Iroquoian languages|iroquoises, langues -ita||it|Italian|italien -jav||jv|Javanese|javanais -jbo|||Lojban|lojban -jpn||ja|Japanese|japonais -jpr|||Judeo-Persian|judéo-persan -jrb|||Judeo-Arabic|judéo-arabe -kaa|||Kara-Kalpak|karakalpak -kab|||Kabyle|kabyle -kac|||Kachin; Jingpho|kachin; jingpho -kal||kl|Kalaallisut; Greenlandic|groenlandais -kam|||Kamba|kamba -kan||kn|Kannada|kannada -kar|||Karen languages|karen, langues -kas||ks|Kashmiri|kashmiri -kau||kr|Kanuri|kanouri -kaw|||Kawi|kawi -kaz||kk|Kazakh|kazakh 
-kbd|||Kabardian|kabardien -kha|||Khasi|khasi -khi|||Khoisan languages|khoïsan, langues -khm||km|Central Khmer|khmer central -kho|||Khotanese; Sakan|khotanais; sakan -kik||ki|Kikuyu; Gikuyu|kikuyu -kin||rw|Kinyarwanda|rwanda -kir||ky|Kirghiz; Kyrgyz|kirghiz -kmb|||Kimbundu|kimbundu -kok|||Konkani|konkani -kom||kv|Komi|kom -kon||kg|Kongo|kongo -kor||ko|Korean|coréen -kos|||Kosraean|kosrae -kpe|||Kpelle|kpellé -krc|||Karachay-Balkar|karatchai balkar -krl|||Karelian|carélien -kro|||Kru languages|krou, langues -kru|||Kurukh|kurukh -kua||kj|Kuanyama; Kwanyama|kuanyama; kwanyama -kum|||Kumyk|koumyk -kur||ku|Kurdish|kurde -kut|||Kutenai|kutenai -lad|||Ladino|judéo-espagnol -lah|||Lahnda|lahnda -lam|||Lamba|lamba -lao||lo|Lao|lao -lat||la|Latin|latin -lav||lv|Latvian|letton -lez|||Lezghian|lezghien -lim||li|Limburgan; Limburger; Limburgish|limbourgeois -lin||ln|Lingala|lingala -lit||lt|Lithuanian|lituanien -lol|||Mongo|mongo -loz|||Lozi|lozi -ltz||lb|Luxembourgish; Letzeburgesch|luxembourgeois -lua|||Luba-Lulua|luba-lulua -lub||lu|Luba-Katanga|luba-katanga -lug||lg|Ganda|ganda -lui|||Luiseno|luiseno -lun|||Lunda|lunda -luo|||Luo (Kenya and Tanzania)|luo (Kenya et Tanzanie) -lus|||Lushai|lushai -mac|mkd|mk|Macedonian|macédonien -mad|||Madurese|madourais -mag|||Magahi|magahi -mah||mh|Marshallese|marshall -mai|||Maithili|maithili -mak|||Makasar|makassar -mal||ml|Malayalam|malayalam -man|||Mandingo|mandingue -mao|mri|mi|Maori|maori -map|||Austronesian languages|austronésiennes, langues -mar||mr|Marathi|marathe -mas|||Masai|massaï -may|msa|ms|Malay|malais -mdf|||Moksha|moksa -mdr|||Mandar|mandar -men|||Mende|mendé -mga|||Irish, Middle (900-1200)|irlandais moyen (900-1200) -mic|||Mi'kmaq; Micmac|mi'kmaq; micmac -min|||Minangkabau|minangkabau -mis|||Uncoded languages|langues non codées -mkh|||Mon-Khmer languages|môn-khmer, langues -mlg||mg|Malagasy|malgache -mlt||mt|Maltese|maltais -mnc|||Manchu|mandchou -mni|||Manipuri|manipuri -mno|||Manobo languages|manobo, langues 
-moh|||Mohawk|mohawk -mon||mn|Mongolian|mongol -mos|||Mossi|moré -mul|||Multiple languages|multilingue -mun|||Munda languages|mounda, langues -mus|||Creek|muskogee -mwl|||Mirandese|mirandais -mwr|||Marwari|marvari -myn|||Mayan languages|maya, langues -myv|||Erzya|erza -nah|||Nahuatl languages|nahuatl, langues -nai|||North American Indian languages|nord-amérindiennes, langues -nap|||Neapolitan|napolitain -nau||na|Nauru|nauruan -nav||nv|Navajo; Navaho|navaho -nbl||nr|Ndebele, South; South Ndebele|ndébélé du Sud -nde||nd|Ndebele, North; North Ndebele|ndébélé du Nord -ndo||ng|Ndonga|ndonga -nds|||Low German; Low Saxon; German, Low; Saxon, Low|bas allemand; bas saxon; allemand, bas; saxon, bas -nep||ne|Nepali|népalais -new|||Nepal Bhasa; Newari|nepal bhasa; newari -nia|||Nias|nias -nic|||Niger-Kordofanian languages|nigéro-kordofaniennes, langues -niu|||Niuean|niué -nno||nn|Norwegian Nynorsk; Nynorsk, Norwegian|norvégien nynorsk; nynorsk, norvégien -nob||nb|Bokmål, Norwegian; Norwegian Bokmål|norvégien bokmål -nog|||Nogai|nogaï; nogay -non|||Norse, Old|norrois, vieux -nor||no|Norwegian|norvégien -nqo|||N'Ko|n'ko -nso|||Pedi; Sepedi; Northern Sotho|pedi; sepedi; sotho du Nord -nub|||Nubian languages|nubiennes, langues -nwc|||Classical Newari; Old Newari; Classical Nepal Bhasa|newari classique -nya||ny|Chichewa; Chewa; Nyanja|chichewa; chewa; nyanja -nym|||Nyamwezi|nyamwezi -nyn|||Nyankole|nyankolé -nyo|||Nyoro|nyoro -nzi|||Nzima|nzema -oci||oc|Occitan (post 1500); Provençal|occitan (après 1500); provençal -oji||oj|Ojibwa|ojibwa -ori||or|Oriya|oriya -orm||om|Oromo|galla -osa|||Osage|osage -oss||os|Ossetian; Ossetic|ossète -ota|||Turkish, Ottoman (1500-1928)|turc ottoman (1500-1928) -oto|||Otomian languages|otomi, langues -paa|||Papuan languages|papoues, langues -pag|||Pangasinan|pangasinan -pal|||Pahlavi|pahlavi -pam|||Pampanga; Kapampangan|pampangan -pan||pa|Panjabi; Punjabi|pendjabi -pap|||Papiamento|papiamento -pau|||Palauan|palau -peo|||Persian, Old (ca.600-400 
B.C.)|perse, vieux (ca. 600-400 av. J.-C.) -per|fas|fa|Persian|persan -phi|||Philippine languages|philippines, langues -phn|||Phoenician|phénicien -pli||pi|Pali|pali -pol||pl|Polish|polonais -pon|||Pohnpeian|pohnpei -por||pt|Portuguese|portugais -pra|||Prakrit languages|prâkrit, langues -pro|||Provençal, Old (to 1500)|provençal ancien (jusqu'à 1500) -pus||ps|Pushto; Pashto|pachto -qaa-qtz|||Reserved for local use|réservée à l'usage local -que||qu|Quechua|quechua -raj|||Rajasthani|rajasthani -rap|||Rapanui|rapanui -rar|||Rarotongan; Cook Islands Maori|rarotonga; maori des îles Cook -roa|||Romance languages|romanes, langues -roh||rm|Romansh|romanche -rom|||Romany|tsigane -rum|ron|ro|Romanian; Moldavian; Moldovan|roumain; moldave -run||rn|Rundi|rundi -rup|||Aromanian; Arumanian; Macedo-Romanian|aroumain; macédo-roumain -rus||ru|Russian|russe -sad|||Sandawe|sandawe -sag||sg|Sango|sango -sah|||Yakut|iakoute -sai|||South American Indian (Other)|indiennes d'Amérique du Sud, autres langues -sal|||Salishan languages|salishennes, langues -sam|||Samaritan Aramaic|samaritain -san||sa|Sanskrit|sanskrit -sas|||Sasak|sasak -sat|||Santali|santal -scn|||Sicilian|sicilien -sco|||Scots|écossais -sel|||Selkup|selkoupe -sem|||Semitic languages|sémitiques, langues -sga|||Irish, Old (to 900)|irlandais ancien (jusqu'à 900) -sgn|||Sign Languages|langues des signes -shn|||Shan|chan -sid|||Sidamo|sidamo -sin||si|Sinhala; Sinhalese|singhalais -sio|||Siouan languages|sioux, langues -sit|||Sino-Tibetan languages|sino-tibétaines, langues -sla|||Slavic languages|slaves, langues -slo|slk|sk|Slovak|slovaque -slv||sl|Slovenian|slovène -sma|||Southern Sami|sami du Sud -sme||se|Northern Sami|sami du Nord -smi|||Sami languages|sames, langues -smj|||Lule Sami|sami de Lule -smn|||Inari Sami|sami d'Inari -smo||sm|Samoan|samoan -sms|||Skolt Sami|sami skolt -sna||sn|Shona|shona -snd||sd|Sindhi|sindhi -snk|||Soninke|soninké -sog|||Sogdian|sogdien -som||so|Somali|somali -son|||Songhai languages|songhai, 
langues -sot||st|Sotho, Southern|sotho du Sud -spa||es|Spanish; Castilian|espagnol; castillan -srd||sc|Sardinian|sarde -srn|||Sranan Tongo|sranan tongo -srp||sr|Serbian|serbe -srr|||Serer|sérère -ssa|||Nilo-Saharan languages|nilo-sahariennes, langues -ssw||ss|Swati|swati -suk|||Sukuma|sukuma -sun||su|Sundanese|soundanais -sus|||Susu|soussou -sux|||Sumerian|sumérien -swa||sw|Swahili|swahili -swe||sv|Swedish|suédois -syc|||Classical Syriac|syriaque classique -syr|||Syriac|syriaque -tah||ty|Tahitian|tahitien -tai|||Tai languages|tai, langues -tam||ta|Tamil|tamoul -tat||tt|Tatar|tatar -tel||te|Telugu|télougou -tem|||Timne|temne -ter|||Tereno|tereno -tet|||Tetum|tetum -tgk||tg|Tajik|tadjik -tgl||tl|Tagalog|tagalog -tha||th|Thai|thaï -tib|bod|bo|Tibetan|tibétain -tig|||Tigre|tigré -tir||ti|Tigrinya|tigrigna -tiv|||Tiv|tiv -tkl|||Tokelau|tokelau -tlh|||Klingon; tlhIngan-Hol|klingon -tli|||Tlingit|tlingit -tmh|||Tamashek|tamacheq -tog|||Tonga (Nyasa)|tonga (Nyasa) -ton||to|Tonga (Tonga Islands)|tongan (Îles Tonga) -tpi|||Tok Pisin|tok pisin -tsi|||Tsimshian|tsimshian -tsn||tn|Tswana|tswana -tso||ts|Tsonga|tsonga -tuk||tk|Turkmen|turkmène -tum|||Tumbuka|tumbuka -tup|||Tupi languages|tupi, langues -tur||tr|Turkish|turc -tut|||Altaic languages|altaïques, langues -tvl|||Tuvalu|tuvalu -twi||tw|Twi|twi -tyv|||Tuvinian|touva -udm|||Udmurt|oudmourte -uga|||Ugaritic|ougaritique -uig||ug|Uighur; Uyghur|ouïgour -ukr||uk|Ukrainian|ukrainien -umb|||Umbundu|umbundu -und|||Undetermined|indéterminée -urd||ur|Urdu|ourdou -uzb||uz|Uzbek|ouszbek -vai|||Vai|vaï -ven||ve|Venda|venda -vie||vi|Vietnamese|vietnamien -vol||vo|Volapük|volapük -vot|||Votic|vote -wak|||Wakashan languages|wakashanes, langues -wal|||Walamo|walamo -war|||Waray|waray -was|||Washo|washo -wel|cym|cy|Welsh|gallois -wen|||Sorbian languages|sorabes, langues -wln||wa|Walloon|wallon -wol||wo|Wolof|wolof -xal|||Kalmyk; Oirat|kalmouk; oïrat -xho||xh|Xhosa|xhosa -yao|||Yao|yao -yap|||Yapese|yapois -yid||yi|Yiddish|yiddish 
-yor||yo|Yoruba|yoruba -ypk|||Yupik languages|yupik, langues -zap|||Zapotec|zapotèque -zbl|||Blissymbols; Blissymbolics; Bliss|symboles Bliss; Bliss -zen|||Zenaga|zenaga -zha||za|Zhuang; Chuang|zhuang; chuang -znd|||Zande languages|zandé, langues -zul||zu|Zulu|zoulou -zun|||Zuni|zuni -zxx|||No linguistic content; Not applicable|pas de contenu linguistique; non applicable -zza|||Zaza; Dimili; Dimli; Kirdki; Kirmanjki; Zazaki|zaza; dimili; dimli; kirdki; kirmanjki; zazaki \ No newline at end of file diff --git a/libs/guessit/__init__.py b/libs/guessit/__init__.py index 4e78bc31..66bcb3d4 100644 --- a/libs/guessit/__init__.py +++ b/libs/guessit/__init__.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2011 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,70 +18,86 @@ # along with this program. If not, see . 
# -from __future__ import unicode_literals +from __future__ import absolute_import, division, print_function, unicode_literals + +import pkg_resources +from .__version__ import __version__ -__version__ = '0.6.2' __all__ = ['Guess', 'Language', 'guess_file_info', 'guess_video_info', - 'guess_movie_info', 'guess_episode_info'] + 'guess_movie_info', 'guess_episode_info', + 'default_options'] # Do python3 detection before importing any other module, to be sure that # it will then always be available # with code from http://lucumr.pocoo.org/2011/1/22/forwards-compatible-python/ import sys -if sys.version_info[0] >= 3: - PY3 = True +if sys.version_info[0] >= 3: # pragma: no cover + PY2, PY3 = False, True unicode_text_type = str native_text_type = str base_text_type = str + def u(x): return str(x) + def s(x): return x + class UnicodeMixin(object): __str__ = lambda x: x.__unicode__() import binascii + def to_hex(x): return binascii.hexlify(x).decode('utf-8') -else: - PY3 = False - __all__ = [ str(s) for s in __all__ ] # fix imports for python2 +else: # pragma: no cover + PY2, PY3 = True, False + __all__ = [str(s) for s in __all__] # fix imports for python2 unicode_text_type = unicode native_text_type = str base_text_type = basestring + def u(x): if isinstance(x, str): return x.decode('utf-8') + if isinstance(x, list): + return [u(s) for s in x] return unicode(x) + def s(x): if isinstance(x, unicode): return x.encode('utf-8') if isinstance(x, list): - return [ s(y) for y in x ] + return [s(y) for y in x] if isinstance(x, tuple): return tuple(s(y) for y in x) if isinstance(x, dict): return dict((s(key), s(value)) for key, value in x.items()) return x + class UnicodeMixin(object): __str__ = lambda x: unicode(x).encode('utf-8') + def to_hex(x): return x.encode('hex') + range = xrange -from guessit.guess import Guess, merge_all + +from guessit.guess import Guess, smart_merge from guessit.language import Language from guessit.matcher import IterativeMatcher -from 
guessit.textutils import clean_string +from guessit.textutils import clean_default, is_camel, from_camel +import babelfish +import os.path import logging -import json +from copy import deepcopy log = logging.getLogger(__name__) - class NullHandler(logging.Handler): def emit(self, record): pass @@ -91,137 +107,193 @@ h = NullHandler() log.addHandler(h) -def _guess_filename(filename, filetype): - def find_nodes(tree, props): - """Yields all nodes containing any of the given props.""" - if isinstance(props, base_text_type): - props = [props] - for node in tree.nodes(): - if any(prop in node.guess for prop in props): - yield node +def _guess_filename(filename, options=None, **kwargs): + mtree = _build_filename_mtree(filename, options=options, **kwargs) + if options.get('split_camel'): + _add_camel_properties(mtree, options=options) + return mtree.matched() - def warning(title): - log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string())) - return m - mtree = IterativeMatcher(filename, filetype=filetype) +def _build_filename_mtree(filename, options=None, **kwargs): + mtree = IterativeMatcher(filename, options=options, **kwargs) + second_pass_options = mtree.second_pass_options + if second_pass_options: + log.debug("Running 2nd pass") + merged_options = dict(options) + merged_options.update(second_pass_options) + mtree = IterativeMatcher(filename, options=merged_options, **kwargs) + return mtree - m = mtree.matched() - second_pass_opts = [] - second_pass_transfo_opts = {} +def _add_camel_properties(mtree, options=None, **kwargs): + prop = 'title' if mtree.matched().get('type') != 'episode' else 'series' + value = mtree.matched().get(prop) + _guess_camel_string(mtree, value, options=options, skip_title=False, **kwargs) - # if there are multiple possible years found, we assume the first one is - # part of the title, reparse the tree taking this into account - years = set(n.value for n in find_nodes(mtree.match_tree, 'year')) - if len(years) >= 2: - 
second_pass_opts.append('skip_first_year') + for leaf in mtree.match_tree.unidentified_leaves(): + value = leaf.value + _guess_camel_string(mtree, value, options=options, skip_title=True, **kwargs) - to_skip_language_nodes = [] - title_nodes = set(n for n in find_nodes(mtree.match_tree, ['title', 'series'])) - title_spans = {} - for title_node in title_nodes: - title_spans[title_node.span[0]] = title_node - title_spans[title_node.span[1]] = title_node +def _guess_camel_string(mtree, string, options=None, skip_title=False, **kwargs): + if string and is_camel(string): + log.debug('"%s" is camel cased. Try to detect more properties.' % (string,)) + uncameled_value = from_camel(string) + merged_options = dict(options) + if 'type' in mtree.match_tree.info: + current_type = mtree.match_tree.info.get('type') + if current_type and current_type != 'unknown': + merged_options['type'] = current_type + camel_tree = _build_filename_mtree(uncameled_value, options=merged_options, name_only=True, skip_title=skip_title, **kwargs) + if len(camel_tree.matched()) > 0: + mtree.matched().update(camel_tree.matched()) + return True + return False - for lang_key in ('language', 'subtitleLanguage'): - langs = {} - lang_nodes = set(n for n in find_nodes(mtree.match_tree, lang_key)) - for lang_node in lang_nodes: - lang = lang_node.guess.get(lang_key, None) - if len(lang_node.value) > 3 and (lang_node.span[0] in title_spans.keys() or lang_node.span[1] in title_spans.keys()): - # Language is next or before title, and is not a language code. Add to skip for 2nd pass. +def guess_video_metadata(filename): + """Gets the video metadata properties out of a given file. The file needs to + exist on the filesystem to be able to be analyzed. An empty guess is + returned otherwise. 
- # if filetype is subtitle and the language appears last, just before - # the extension, then it is likely a subtitle language - parts = clean_string(lang_node.root.value).split() - if m['type'] in ['moviesubtitle', 'episodesubtitle'] and (parts.index(lang_node.value) == len(parts) - 2): - continue + You need to have the Enzyme python package installed for this to work.""" + result = Guess() - to_skip_language_nodes.append(lang_node) - elif not lang in langs: - langs[lang] = lang_node - else: - # The same language was found. Keep the more confident one, and add others to skip for 2nd pass. - existing_lang_node = langs[lang] - to_skip = None - if existing_lang_node.guess.confidence('language') >= lang_node.guess.confidence('language'): - # lang_node is to remove - to_skip = lang_node + def found(prop, value): + result[prop] = value + log.debug('Found with enzyme %s: %s' % (prop, value)) + + # first get the size of the file, in bytes + try: + size = os.stat(filename).st_size + found('fileSize', size) + + except Exception as e: + log.error('Cannot get video file size: %s' % e) + # file probably does not exist, we might as well return now + return result + + # then get additional metadata from the file using enzyme, if available + try: + import enzyme + + with open(filename) as f: + mkv = enzyme.MKV(f) + + found('duration', mkv.info.duration.total_seconds()) + + if mkv.video_tracks: + video_track = mkv.video_tracks[0] + + # resolution + if video_track.height in (480, 720, 1080): + if video_track.interlaced: + found('screenSize', '%di' % video_track.height) + else: + found('screenSize', '%dp' % video_track.height) else: - # existing_lang_node is to remove - langs[lang] = lang_node - to_skip = existing_lang_node - to_skip_language_nodes.append(to_skip) + # TODO: do we want this? 
+ #found('screenSize', '%dx%d' % (video_track.width, video_track.height)) + pass + + # video codec + if video_track.codec_id == 'V_MPEG4/ISO/AVC': + found('videoCodec', 'h264') + elif video_track.codec_id == 'V_MPEG4/ISO/SP': + found('videoCodec', 'DivX') + elif video_track.codec_id == 'V_MPEG4/ISO/ASP': + found('videoCodec', 'XviD') + + else: + log.warning('MKV has no video track') + + if mkv.audio_tracks: + audio_track = mkv.audio_tracks[0] + # audio codec + if audio_track.codec_id == 'A_AC3': + found('audioCodec', 'AC3') + elif audio_track.codec_id == 'A_DTS': + found('audioCodec', 'DTS') + elif audio_track.codec_id == 'A_AAC': + found('audioCodec', 'AAC') + else: + log.warning('MKV has no audio track') + + if mkv.subtitle_tracks: + embedded_subtitle_languages = set() + for st in mkv.subtitle_tracks: + try: + if st.language: + lang = babelfish.Language.fromalpha3b(st.language) + elif st.name: + lang = babelfish.Language.fromname(st.name) + else: + lang = babelfish.Language('und') + + except babelfish.Error: + lang = babelfish.Language('und') + + embedded_subtitle_languages.add(lang) + + found('subtitleLanguage', embedded_subtitle_languages) + else: + log.debug('MKV has no subtitle track') + + return result + + except ImportError: + log.error('Cannot get video file metadata, missing dependency: enzyme') + log.error('Please install it from PyPI, by doing eg: pip install enzyme') + return result + + except IOError as e: + log.error('Could not open file: %s' % filename) + log.error('Make sure it exists and is available for reading on the filesystem') + log.error('Error: %s' % e) + return result + + except enzyme.Error as e: + log.error('Cannot guess video file metadata') + log.error('enzyme.Error while reading file: %s' % filename) + log.error('Error: %s' % e) + return result + +default_options = {} - if to_skip_language_nodes: - second_pass_transfo_opts['guess_language'] = ( - ((), { 'skip': [ { 'node_idx': node.parent.node_idx, - 'span': node.span } - for node in 
to_skip_language_nodes ] })) - - if second_pass_opts or second_pass_transfo_opts: - # 2nd pass is needed - log.info("Running 2nd pass with options: %s" % second_pass_opts) - log.info("Transfo options: %s" % second_pass_transfo_opts) - mtree = IterativeMatcher(filename, filetype=filetype, - opts=second_pass_opts, - transfo_opts=second_pass_transfo_opts) - - m = mtree.matched() - - if 'language' not in m and 'subtitleLanguage' not in m or 'title' not in m: - return m - - # if we found some language, make sure we didn't cut a title or sth... - mtree2 = IterativeMatcher(filename, filetype=filetype, - opts=['nolanguage', 'nocountry']) - m2 = mtree2.matched() - - if m.get('title') != m2.get('title'): - title = next(find_nodes(mtree.match_tree, 'title')) - title2 = next(find_nodes(mtree2.match_tree, 'title')) - - # if a node is in an explicit group, then the correct title is probably - # the other one - if title.root.node_at(title.node_idx[:2]).is_explicit(): - return m2 - elif title2.root.node_at(title2.node_idx[:2]).is_explicit(): - return m - - return m - - -def guess_file_info(filename, filetype='autodetect', info=None): +def guess_file_info(filename, info=None, options=None, **kwargs): """info can contain the names of the various plugins, such as 'filename' to detect filename info, or 'hash_md5' to get the md5 hash of the file. 
- >>> guess_file_info('tests/dummy.srt', 'autodetect', info = ['hash_md5', 'hash_sha1']) - {'hash_md5': 'e781de9b94ba2753a8e2945b2c0a123d', 'hash_sha1': 'bfd18e2f4e5d59775c2bc14d80f56971891ed620'} + >>> testfile = os.path.join(os.path.dirname(__file__), 'test/dummy.srt') + >>> g = guess_file_info(testfile, info = ['hash_md5', 'hash_sha1']) + >>> g['hash_md5'], g['hash_sha1'] + ('64de6b5893cac24456c46a935ef9c359', 'a703fc0fa4518080505809bf562c6fc6f7b3c98c') """ + info = info or 'filename' + options = options or {} + if default_options: + merged_options = deepcopy(default_options) + merged_options.update(options) + options = merged_options + result = [] hashers = [] # Force unicode as soon as possible filename = u(filename) - if info is None: - info = ['filename'] - if isinstance(info, base_text_type): info = [info] for infotype in info: if infotype == 'filename': - result.append(_guess_filename(filename, filetype)) + result.append(_guess_filename(filename, options, **kwargs)) elif infotype == 'hash_mpc': from guessit.hash_mpc import hash_file try: - result.append(Guess({'hash_mpc': hash_file(filename)}, + result.append(Guess({infotype: hash_file(filename)}, confidence=1.0)) except Exception as e: log.warning('Could not compute MPC-style hash because: %s' % e) @@ -229,7 +301,7 @@ def guess_file_info(filename, filetype='autodetect', info=None): elif infotype == 'hash_ed2k': from guessit.hash_ed2k import hash_file try: - result.append(Guess({'hash_ed2k': hash_file(filename)}, + result.append(Guess({infotype: hash_file(filename)}, confidence=1.0)) except Exception as e: log.warning('Could not compute ed2k hash because: %s' % e) @@ -243,6 +315,11 @@ def guess_file_info(filename, filetype='autodetect', info=None): except AttributeError: log.warning('Could not compute %s hash because it is not available from python\'s hashlib module' % hashname) + elif infotype == 'video': + g = guess_video_metadata(filename) + if g: + result.append(g) + else: log.warning('Invalid 
infotype: %s' % infotype) @@ -265,25 +342,18 @@ def guess_file_info(filename, filetype='autodetect', info=None): except Exception as e: log.warning('Could not compute hash because: %s' % e) - result = merge_all(result) - - # last minute adjustments - - # if country is in the guessed properties, make it part of the filename - if 'series' in result and 'country' in result: - result['series'] += ' (%s)' % result['country'].alpha2.upper() - + result = smart_merge(result) return result -def guess_video_info(filename, info=None): - return guess_file_info(filename, 'autodetect', info) +def guess_video_info(filename, info=None, options=None, **kwargs): + return guess_file_info(filename, info=info, options=options, type='video', **kwargs) -def guess_movie_info(filename, info=None): - return guess_file_info(filename, 'movie', info) +def guess_movie_info(filename, info=None, options=None, **kwargs): + return guess_file_info(filename, info=info, options=options, type='movie', **kwargs) -def guess_episode_info(filename, info=None): - return guess_file_info(filename, 'episode', info) +def guess_episode_info(filename, info=None, options=None, **kwargs): + return guess_file_info(filename, info=info, options=options, type='episode', **kwargs) diff --git a/libs/guessit/__main__.py b/libs/guessit/__main__.py index ccfa3af6..759c380b 100644 --- a/libs/guessit/__main__.py +++ b/libs/guessit/__main__.py @@ -2,7 +2,8 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2011 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2013 Rémi Alvergnat # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,109 +19,265 @@ # along with this program. If not, see . 
# -from __future__ import unicode_literals -from __future__ import print_function -from guessit import u -from guessit import slogging, guess_file_info -from optparse import OptionParser +from __future__ import absolute_import, division, print_function, unicode_literals +from collections import defaultdict import logging -import sys import os -import locale + +from guessit import PY2, u, guess_file_info, __version__ +from guessit.options import get_opts +from guessit.__version__ import __version__ -def detect_filename(filename, filetype, info=['filename'], advanced = False): +def guess_file(filename, info='filename', options=None, **kwargs): + options = options or {} filename = u(filename) - print('For:', filename) - print('GuessIt found:', guess_file_info(filename, filetype, info).nice_string(advanced)) + if not options.get('yaml') and not options.get('show_property'): + print('For:', filename) + guess = guess_file_info(filename, info, options, **kwargs) + + if not options.get('unidentified'): + try: + del guess['unidentified'] + except KeyError: + pass + + if options.get('show_property'): + print(guess.get(options.get('show_property'), '')) + return + + if options.get('yaml'): + import yaml + for k, v in guess.items(): + if isinstance(v, list) and len(v) == 1: + guess[k] = v[0] + ystr = yaml.safe_dump({filename: dict(guess)}, default_flow_style=False) + i = 0 + for yline in ystr.splitlines(): + if i == 0: + print("? 
" + yline[:-1]) + elif i == 1: + print(":" + yline[1:]) + else: + print(yline) + i += 1 + return + print('GuessIt found:', guess.nice_string(options.get('advanced'))) -def run_demo(episodes=True, movies=True, advanced=False): +def _supported_properties(): + all_properties = defaultdict(list) + transformers_properties = [] + + from guessit.plugins import transformers + for transformer in transformers.all_transformers(): + supported_properties = transformer.supported_properties() + transformers_properties.append((transformer, supported_properties)) + + if isinstance(supported_properties, dict): + for property_name, possible_values in supported_properties.items(): + all_properties[property_name].extend(possible_values) + else: + for property_name in supported_properties: + all_properties[property_name] # just make sure it exists + + return all_properties, transformers_properties + + +def display_transformers(): + print('GuessIt transformers:') + _, transformers_properties = _supported_properties() + for transformer, _ in transformers_properties: + print('[@] %s (%s)' % (transformer.name, transformer.priority)) + + +def display_properties(options): + values = options.values + transformers = options.transformers + name_only = options.name_only + + print('GuessIt properties:') + all_properties, transformers_properties = _supported_properties() + if name_only: + # the 'container' property does not apply when using the --name-only + # option + del all_properties['container'] + + if transformers: + for transformer, properties_list in transformers_properties: + print('[@] %s (%s)' % (transformer.name, transformer.priority)) + for property_name in properties_list: + property_values = all_properties.get(property_name) + print(' [+] %s' % (property_name,)) + if property_values and values: + _display_property_values(property_name, indent=4) + else: + properties_list = sorted(all_properties.keys()) + for property_name in properties_list: + property_values = 
all_properties.get(property_name) + print(' [+] %s' % (property_name,)) + if property_values and values: + _display_property_values(property_name, indent=4) + + +def _display_property_values(property_name, indent=2): + all_properties, _ = _supported_properties() + property_values = all_properties.get(property_name) + for property_value in property_values: + print(indent * ' ' + '[!] %s' % (property_value,)) + + +def run_demo(episodes=True, movies=True, options=None): # NOTE: tests should not be added here but rather in the tests/ folder # this is just intended as a quick example if episodes: - testeps = [ 'Series/Californication/Season 2/Californication.2x05.Vaginatown.HDTV.XviD-0TV.[tvu.org.ru].avi', - 'Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi', - 'Series/Treme/Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.[tvu.org.ru].avi', - 'Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi', - 'Series/Duckman/Duckman - S1E13 Joking The Chicken (unedited).avi', - 'Series/Simpsons/The_simpsons_s13e18_-_i_am_furious_yellow.mpg', - 'Series/Simpsons/Saison 12 Français/Simpsons,.The.12x08.A.Bas.Le.Sergent.Skinner.FR.[tvu.org.ru].avi', - 'Series/Dr._Slump_-_002_DVB-Rip_Catalan_by_kelf.avi', - 'Series/Kaamelott/Kaamelott - Livre V - Second Volet - HD 704x396 Xvid 2 pass - Son 5.1 - TntRip by Slurm.avi' - ] + testeps = ['Series/Californication/Season 2/Californication.2x05.Vaginatown.HDTV.XviD-0TV.[tvu.org.ru].avi', + 'Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi', + 'Series/Treme/Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.[tvu.org.ru].avi', + 'Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi', + 'Series/Duckman/Duckman - S1E13 Joking The Chicken (unedited).avi', + 'Series/Simpsons/The_simpsons_s13e18_-_i_am_furious_yellow.mpg', + 'Series/Simpsons/Saison 12 Français/Simpsons,.The.12x08.A.Bas.Le.Sergent.Skinner.FR.[tvu.org.ru].avi', + 
'Series/Dr._Slump_-_002_DVB-Rip_Catalan_by_kelf.avi', + 'Series/Kaamelott/Kaamelott - Livre V - Second Volet - HD 704x396 Xvid 2 pass - Son 5.1 - TntRip by Slurm.avi'] for f in testeps: - print('-'*80) - detect_filename(f, filetype='episode', advanced=advanced) - + print('-' * 80) + guess_file(f, options=options, type='episode') if movies: - testmovies = [ 'Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv', - 'Movies/El Dia de la Bestia (1995)/El.dia.de.la.bestia.DVDrip.Spanish.DivX.by.Artik[SEDG].avi', - 'Movies/Blade Runner (1982)/Blade.Runner.(1982).(Director\'s.Cut).CD1.DVDRip.XviD.AC3-WAF.avi', - 'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv', - 'Movies/Sin City (BluRay) (2005)/Sin.City.2005.BDRip.720p.x264.AC3-SEPTiC.mkv', - 'Movies/Borat (2006)/Borat.(2006).R5.PROPER.REPACK.DVDRip.XviD-PUKKA.avi', # FIXME: PROPER and R5 get overwritten - '[XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv', # FIXME: title gets overwritten - 'Battle Royale (2000)/Battle.Royale.(Batoru.Rowaiaru).(2000).(Special.Edition).CD1of2.DVDRiP.XviD-[ZeaL].avi', - 'Movies/Brazil (1985)/Brazil_Criterion_Edition_(1985).CD2.English.srt', - 'Movies/Persepolis (2007)/[XCT] Persepolis [H264+Aac-128(Fr-Eng)+ST(Fr-Eng)+Ind].mkv', - 'Movies/Toy Story (1995)/Toy Story [HDTV 720p English-Spanish].mkv', - 'Movies/Pirates of the Caribbean: The Curse of the Black Pearl (2003)/Pirates.Of.The.Carribean.DC.2003.iNT.DVDRip.XviD.AC3-NDRT.CD1.avi', - 'Movies/Office Space (1999)/Office.Space.[Dual-DVDRip].[Spanish-English].[XviD-AC3-AC3].[by.Oswald].avi', - 'Movies/The NeverEnding Story (1984)/The.NeverEnding.Story.1.1984.DVDRip.AC3.Xvid-Monteque.avi', - 'Movies/Juno (2007)/Juno KLAXXON.avi', - 'Movies/Chat noir, chat blanc (1998)/Chat noir, Chat blanc - Emir Kusturica (VO - VF - sub FR - Chapters).mkv', - 'Movies/Wild Zero (2000)/Wild.Zero.DVDivX-EPiC.srt', - 'Movies/El Bosque Animado 
(1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720x432].avi', - 'testsmewt_bugs/movies/Baraka_Edition_Collector.avi' - ] + testmovies = ['Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv', + 'Movies/El Dia de la Bestia (1995)/El.dia.de.la.bestia.DVDrip.Spanish.DivX.by.Artik[SEDG].avi', + 'Movies/Blade Runner (1982)/Blade.Runner.(1982).(Director\'s.Cut).CD1.DVDRip.XviD.AC3-WAF.avi', + 'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv', + 'Movies/Sin City (BluRay) (2005)/Sin.City.2005.BDRip.720p.x264.AC3-SEPTiC.mkv', + 'Movies/Borat (2006)/Borat.(2006).R5.PROPER.REPACK.DVDRip.XviD-PUKKA.avi', + '[XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv', + 'Battle Royale (2000)/Battle.Royale.(Batoru.Rowaiaru).(2000).(Special.Edition).CD1of2.DVDRiP.XviD-[ZeaL].avi', + 'Movies/Brazil (1985)/Brazil_Criterion_Edition_(1985).CD2.English.srt', + 'Movies/Persepolis (2007)/[XCT] Persepolis [H264+Aac-128(Fr-Eng)+ST(Fr-Eng)+Ind].mkv', + 'Movies/Toy Story (1995)/Toy Story [HDTV 720p English-Spanish].mkv', + 'Movies/Pirates of the Caribbean: The Curse of the Black Pearl (2003)/Pirates.Of.The.Carribean.DC.2003.iNT.DVDRip.XviD.AC3-NDRT.CD1.avi', + 'Movies/Office Space (1999)/Office.Space.[Dual-DVDRip].[Spanish-English].[XviD-AC3-AC3].[by.Oswald].avi', + 'Movies/The NeverEnding Story (1984)/The.NeverEnding.Story.1.1984.DVDRip.AC3.Xvid-Monteque.avi', + 'Movies/Juno (2007)/Juno KLAXXON.avi', + 'Movies/Chat noir, chat blanc (1998)/Chat noir, Chat blanc - Emir Kusturica (VO - VF - sub FR - Chapters).mkv', + 'Movies/Wild Zero (2000)/Wild.Zero.DVDivX-EPiC.srt', + 'Movies/El Bosque Animado (1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720x432].avi', + 'testsmewt_bugs/movies/Baraka_Edition_Collector.avi' + ] for f in testmovies: - print('-'*80) - detect_filename(f, filetype = 'movie', advanced = advanced) + print('-' * 80) + guess_file(f, 
options=options, type='movie') -def main(): - slogging.setupLogging() +def submit_bug(filename, options): + import requests # only import when needed + from requests.exceptions import RequestException - # see http://bugs.python.org/issue2128 - if sys.version_info.major < 3 and os.name == 'nt': - for i, a in enumerate(sys.argv): - sys.argv[i] = a.decode(locale.getpreferredencoding()) - - parser = OptionParser(usage = 'usage: %prog [options] file1 [file2...]') - parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, - help = 'display debug output') - parser.add_option('-i', '--info', dest = 'info', default = 'filename', - help = 'the desired information type: filename, hash_mpc or a hash from python\'s ' - 'hashlib module, such as hash_md5, hash_sha1, ...; or a list of any of ' - 'them, comma-separated') - parser.add_option('-t', '--type', dest = 'filetype', default = 'autodetect', - help = 'the suggested file type: movie, episode or autodetect') - parser.add_option('-a', '--advanced', dest = 'advanced', action='store_true', default = False, - help = 'display advanced information for filename guesses, as json output') - parser.add_option('-d', '--demo', action='store_true', dest='demo', default=False, - help = 'run a few builtin tests instead of analyzing a file') + try: + opts = dict((k, v) for k, v in options.__dict__.items() + if v and k != 'submit_bug') - options, args = parser.parse_args() + r = requests.post('http://localhost:5000/bugs', {'filename': filename, + 'version': __version__, + 'options': str(opts)}) + if r.status_code == 200: + print('Successfully submitted file: %s' % r.text) + else: + print('Could not submit bug at the moment, please try again later.') + + except RequestException as e: + print('Could not submit bug at the moment, please try again later.') + + +def main(args=None, setup_logging=True): + if setup_logging: + from guessit import slogging + slogging.setup_logging() + + if PY2: # pragma: no cover + 
import codecs + import locale + import sys + + # see http://bugs.python.org/issue2128 + if os.name == 'nt': + for i, a in enumerate(sys.argv): + sys.argv[i] = a.decode(locale.getpreferredencoding()) + + # see https://github.com/wackou/guessit/issues/43 + # and http://stackoverflow.com/questions/4545661/unicodedecodeerror-when-redirecting-to-file + # Wrap sys.stdout into a StreamWriter to allow writing unicode. + sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout) + + from guessit.plugins import transformers + + if args: + options = get_opts().parse_args(args) + else: # pragma: no cover + options = get_opts().parse_args() if options.verbose: - logging.getLogger('guessit').setLevel(logging.DEBUG) + logging.getLogger().setLevel(logging.DEBUG) + + help_required = True + if options.properties or options.values: + display_properties(options) + help_required = False + elif options.transformers: + display_transformers() + help_required = False if options.demo: - run_demo(episodes=True, movies=True, advanced=options.advanced) - else: - if args: - for filename in args: - detect_filename(filename, - filetype = options.filetype, - info = options.info.split(','), - advanced = options.advanced) + run_demo(episodes=True, movies=True, options=vars(options)) + help_required = False + if options.version: + print('+-------------------------------------------------------+') + print('+ GuessIt ' + __version__ + (28-len(__version__)) * ' ' + '+') + print('+-------------------------------------------------------+') + print('| Please report any bug or feature request at |') + print('| https://github.com/wackou/guessit/issues. 
|') + print('+-------------------------------------------------------+') + help_required = False + + if options.yaml: + try: + import yaml, babelfish + def default_representer(dumper, data): + return dumper.represent_str(str(data)) + yaml.SafeDumper.add_representer(babelfish.Language, default_representer) + yaml.SafeDumper.add_representer(babelfish.Country, default_representer) + except ImportError: # pragma: no cover + print('PyYAML not found. Using default output.') + + filenames = [] + if options.filename: + filenames.extend(options.filename) + if options.input_file: + input_file = open(options.input_file, 'r') + try: + filenames.extend([line.strip() for line in input_file.readlines()]) + finally: + input_file.close() + + filenames = filter(lambda f: f, filenames) + + if filenames: + help_required = False + if options.submit_bug: + for filename in filenames: + submit_bug(filename, options) else: - parser.print_help() + for filename in filenames: + guess_file(filename, + info=options.info.split(','), + options=vars(options)) + + if help_required: # pragma: no cover + get_opts().print_help() if __name__ == '__main__': main() diff --git a/libs/guessit/__version__.py b/libs/guessit/__version__.py new file mode 100644 index 00000000..f8ec056e --- /dev/null +++ b/libs/guessit/__version__.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. 
+# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . +# +__version__ = '0.10.2.dev0' diff --git a/libs/guessit/containers.py b/libs/guessit/containers.py new file mode 100644 index 00000000..74847008 --- /dev/null +++ b/libs/guessit/containers.py @@ -0,0 +1,771 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2013 Rémi Alvergnat +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . +# + +from __future__ import absolute_import, division, print_function, unicode_literals + +from .patterns import compile_pattern, sep +from . 
import base_text_type +from .guess import Guess +import types + + +def _get_span(prop, match): + """Retrieves span for a match""" + if not prop.global_span and match.re.groups: + start = None + end = None + for i in range(1, match.re.groups + 1): + span = match.span(i) + if start is None or span[0] < start: + start = span[0] + if end is None or span[1] > end: + end = span[1] + return start, end + else: + return match.span() + start = span[0] + end = span[1] + + +def _trim_span(span, value, blanks = sep): + start, end = span + + for i in range(0, len(value)): + if value[i] in blanks: + start += 1 + else: + break + + for i in reversed(range(0, len(value))): + if value[i] in blanks: + end -= 1 + else: + break + if end <= start: + return -1, -1 + return start, end + + +def _get_groups(compiled_re): + """ + Retrieves groups from re + + :return: list of group names + """ + if compiled_re.groups: + indexgroup = {} + for k, i in compiled_re.groupindex.items(): + indexgroup[i] = k + ret = [] + for i in range(1, compiled_re.groups + 1): + ret.append(indexgroup.get(i, i)) + return ret + else: + return [None] + + +class NoValidator(object): + def validate(self, prop, string, node, match, entry_start, entry_end): + return True + + +class LeftValidator(object): + """Make sure our match is starting by separator, or by another entry""" + + def validate(self, prop, string, node, match, entry_start, entry_end): + span = _get_span(prop, match) + span = _trim_span(span, string[span[0]:span[1]]) + start, end = span + + sep_start = start <= 0 or string[start - 1] in sep + start_by_other = start in entry_end + if not sep_start and not start_by_other: + return False + return True + + +class RightValidator(object): + """Make sure our match is ended by separator, or by another entry""" + + def validate(self, prop, string, node, match, entry_start, entry_end): + span = _get_span(prop, match) + span = _trim_span(span, string[span[0]:span[1]]) + start, end = span + + sep_end = end >= 
len(string) or string[end] in sep + end_by_other = end in entry_start + if not sep_end and not end_by_other: + return False + return True + + +class ChainedValidator(object): + def __init__(self, *validators): + self._validators = validators + + def validate(self, prop, string, node, match, entry_start, entry_end): + for validator in self._validators: + if not validator.validate(prop, string, node, match, entry_start, entry_end): + return False + return True + + +class SameKeyValidator(object): + def __init__(self, validator_function): + self.validator_function = validator_function + + def validate(self, prop, string, node, match, entry_start, entry_end): + for key in prop.keys: + for same_value_leaf in node.root.leaves_containing(key): + ret = self.validator_function(same_value_leaf, key, prop, string, node, match, entry_start, entry_end) + if ret is not None: + return ret + return True + + +class OnlyOneValidator(SameKeyValidator): + def __init__(self): + super(OnlyOneValidator, self).__init__(lambda same_value_leaf, key, prop, string, node, match, entry_start, entry_end: False) + + +class DefaultValidator(object): + """Make sure our match is surrounded by separators, or by another entry""" + def validate(self, prop, string, node, match, entry_start, entry_end): + span = _get_span(prop, match) + span = _trim_span(span, string[span[0]:span[1]]) + start, end = span + + sep_start = start <= 0 or string[start - 1] in sep + sep_end = end >= len(string) or string[end] in sep + start_by_other = start in entry_end + end_by_other = end in entry_start + if (sep_start or start_by_other) and (sep_end or end_by_other): + return True + return False + + +class FunctionValidator(object): + def __init__(self, function): + self.function = function + + def validate(self, prop, string, node, match, entry_start, entry_end): + return self.function(prop, string, node, match, entry_start, entry_end) + + +class FormatterValidator(object): + def __init__(self, group_name=None, 
formatted_validator=None): + self.group_name = group_name + self.formatted_validator = formatted_validator + + def validate(self, prop, string, node, match, entry_start, entry_end): + if self.group_name: + formatted = prop.format(match.group(self.group_name), self.group_name) + else: + formatted = prop.format(match.group()) + if self.formatted_validator: + return self.formatted_validator(formatted) + else: + return formatted + + +def _get_positions(prop, string, node, match, entry_start, entry_end): + span = match.span() + start = span[0] + end = span[1] + + at_start = True + at_end = True + + while start > 0: + start -= 1 + if string[start] not in sep: + at_start = False + break + while end < len(string) - 1: + end += 1 + if string[end] not in sep: + at_end = False + break + return at_start, at_end + + +class WeakValidator(DefaultValidator): + """Make sure our match is surrounded by separators and is the first or last element in the string""" + def validate(self, prop, string, node, match, entry_start, entry_end): + if super(WeakValidator, self).validate(prop, string, node, match, entry_start, entry_end): + at_start, at_end = _get_positions(prop, string, node, match, entry_start, entry_end) + return at_start or at_end + return False + + +class NeighborValidator(DefaultValidator): + """Make sure the node is next another one""" + def validate(self, prop, string, node, match, entry_start, entry_end): + at_start, at_end = _get_positions(prop, string, node, match, entry_start, entry_end) + + if at_start: + previous_leaf = node.root.previous_leaf(node) + if previous_leaf is not None: + return True + + if at_end: + next_leaf = node.root.next_leaf(node) + if next_leaf is not None: + return True + + return False + + +class LeavesValidator(DefaultValidator): + def __init__(self, lambdas=None, previous_lambdas=None, next_lambdas=None, both_side=False, default_=True): + self.previous_lambdas = previous_lambdas if previous_lambdas is not None else [] + self.next_lambdas = 
next_lambdas if next_lambdas is not None else [] + if lambdas: + self.previous_lambdas.extend(lambdas) + self.next_lambdas.extend(lambdas) + self.both_side = both_side + self.default_ = default_ + + """Make sure our match is surrounded by separators and validates defined lambdas""" + def validate(self, prop, string, node, match, entry_start, entry_end): + if self.default_: + super_ret = super(LeavesValidator, self).validate(prop, string, node, match, entry_start, entry_end) + else: + super_ret = True + if not super_ret: + return False + + previous_ = self._validate_previous(prop, string, node, match, entry_start, entry_end) + next_ = self._validate_next(prop, string, node, match, entry_start, entry_end) + + if previous_ is None and next_ is None: + return super_ret + if self.both_side: + return previous_ and next_ + else: + return previous_ or next_ + + def _validate_previous(self, prop, string, node, match, entry_start, entry_end): + if self.previous_lambdas: + for leaf in node.root.previous_leaves(node): + for lambda_ in self.previous_lambdas: + ret = self._check_rule(lambda_, leaf) + if ret is not None: + return ret + return False + + def _validate_next(self, prop, string, node, match, entry_start, entry_end): + if self.next_lambdas: + for leaf in node.root.next_leaves(node): + for lambda_ in self.next_lambdas: + ret = self._check_rule(lambda_, leaf) + if ret is not None: + return ret + return False + + def _check_rule(self, lambda_, previous_leaf): + return lambda_(previous_leaf) + + +class _Property: + """Represents a property configuration.""" + def __init__(self, keys=None, pattern=None, canonical_form=None, canonical_from_pattern=True, confidence=1.0, enhance=True, global_span=False, validator=DefaultValidator(), formatter=None, disabler=None, confidence_lambda=None): + """ + :param keys: Keys of the property (format, screenSize, ...) + :type keys: string + :param canonical_form: Unique value of the property (DVD, 720p, ...) 
+ :type canonical_form: string + :param pattern: Regexp pattern + :type pattern: string + :param confidence: confidence + :type confidence: float + :param enhance: enhance the pattern + :type enhance: boolean + :param global_span: if True, the whole match span will used to create the Guess. + Else, the span from the capturing groups will be used. + :type global_span: boolean + :param validator: Validator to use + :type validator: :class:`DefaultValidator` + :param formatter: Formater to use + :type formatter: function + """ + if isinstance(keys, list): + self.keys = keys + elif isinstance(keys, base_text_type): + self.keys = [keys] + else: + self.keys = [] + self.canonical_form = canonical_form + if pattern is not None: + self.pattern = pattern + else: + self.pattern = canonical_form + if self.canonical_form is None and canonical_from_pattern: + self.canonical_form = self.pattern + self.compiled = compile_pattern(self.pattern, enhance=enhance) + for group_name in _get_groups(self.compiled): + if isinstance(group_name, base_text_type) and not group_name in self.keys: + self.keys.append(group_name) + if not self.keys: + raise ValueError("No property key is defined") + self.confidence = confidence + self.confidence_lambda = confidence_lambda + self.global_span = global_span + self.validator = validator + self.formatter = formatter + self.disabler = disabler + + def disabled(self, options): + if self.disabler: + return self.disabler(options) + return False + + def format(self, value, group_name=None): + """Retrieves the final value from re group match value""" + formatter = None + if isinstance(self.formatter, dict): + formatter = self.formatter.get(group_name) + if formatter is None and group_name is not None: + formatter = self.formatter.get(None) + else: + formatter = self.formatter + if isinstance(formatter, types.FunctionType): + return formatter(value) + elif formatter is not None: + return formatter.format(value) + return value + + def __repr__(self): + return 
"%s: %s" % (self.keys, self.canonical_form if self.canonical_form else self.pattern) + + +class PropertiesContainer(object): + def __init__(self, **kwargs): + self._properties = [] + self.default_property_kwargs = kwargs + + def unregister_property(self, name, *canonical_forms): + """Unregister a property canonical forms + + If canonical_forms are specified, only those values will be unregistered + + :param name: Property name to unregister + :type name: string + :param canonical_forms: Values to unregister + :type canonical_forms: varargs of string + """ + _properties = [prop for prop in self._properties if prop.name == name and (not canonical_forms or prop.canonical_form in canonical_forms)] + + def register_property(self, name, *patterns, **property_params): + """Register property with defined canonical form and patterns. + + :param name: name of the property (format, screenSize, ...) + :type name: string + :param patterns: regular expression patterns to register for the property canonical_form + :type patterns: varargs of string + """ + properties = [] + for pattern in patterns: + params = dict(self.default_property_kwargs) + params.update(property_params) + if isinstance(pattern, dict): + params.update(pattern) + prop = _Property(name, **params) + else: + prop = _Property(name, pattern, **params) + self._properties.append(prop) + properties.append(prop) + return properties + + def register_canonical_properties(self, name, *canonical_forms, **property_params): + """Register properties from their canonical forms. + + :param name: name of the property (releaseGroup, ...) + :type name: string + :param canonical_forms: values of the property ('ESiR', 'WAF', 'SEPTiC', ...) 
+ :type canonical_forms: varargs of strings + """ + properties = [] + for canonical_form in canonical_forms: + params = dict(property_params) + params['canonical_form'] = canonical_form + properties.extend(self.register_property(name, canonical_form, **property_params)) + return properties + + def unregister_all_properties(self): + """Unregister all defined properties""" + self._properties.clear() + + def find_properties(self, string, node, options, name=None, validate=True, re_match=False, sort=True, multiple=False): + """Find all distinct properties for given string + + If no capturing group is defined in the property, value will be grabbed from the entire match. + + If one ore more unnamed capturing group is defined in the property, first capturing group will be used. + + If named capturing group are defined in the property, they will be returned as property key. + + If validate, found properties will be validated by their defined validator + + If re_match, re.match will be used instead of re.search. + + if sort, found properties will be sorted from longer match to shorter match. + + If multiple is False and multiple values are found for the same property, the more confident one will be returned. + + If multiple is False and multiple values are found for the same property and the same confidence, the longer will be returned. 
+ + :param string: input string + :type string: string + + :param node: current node of the matching tree + :type node: :class:`guessit.matchtree.MatchTree` + + :param name: name of property to find + :type name: string + + :param re_match: use re.match instead of re.search + :type re_match: bool + + :param multiple: Allows multiple property values to be returned + :type multiple: bool + + :return: found properties + :rtype: list of tuples (:class:`_Property`, match, list of tuples (property_name, tuple(value_start, value_end))) + + :see: `_Property` + :see: `register_property` + :see: `register_canonical_properties` + """ + entry_start = {} + entry_end = {} + + entries = [] + duplicate_matches = {} + + ret = [] + + if not string.strip(): + return ret + + # search all properties + for prop in self.get_properties(name): + if not prop.disabled(options): + valid_match = None + if re_match: + match = prop.compiled.match(string) + if match: + entries.append((prop, match)) + else: + matches = list(prop.compiled.finditer(string)) + duplicate_matches[prop] = matches + for match in matches: + entries.append((prop, match)) + + for prop, match in entries: + # compute confidence + if prop.confidence_lambda: + computed_confidence = prop.confidence_lambda(match) + if computed_confidence is not None: + prop.confidence = computed_confidence + + if validate: + # compute entries start and ends + for prop, match in entries: + start, end = _get_span(prop, match) + + if start not in entry_start: + entry_start[start] = [prop] + else: + entry_start[start].append(prop) + + if end not in entry_end: + entry_end[end] = [prop] + else: + entry_end[end].append(prop) + + # remove invalid values + while True: + invalid_entries = [] + for entry in entries: + prop, match = entry + if not prop.validator.validate(prop, string, node, match, entry_start, entry_end): + invalid_entries.append(entry) + if not invalid_entries: + break + for entry in invalid_entries: + prop, match = entry + 
entries.remove(entry) + prop_duplicate_matches = duplicate_matches.get(prop) + if prop_duplicate_matches: + prop_duplicate_matches.remove(match) + invalid_span = _get_span(prop, match) + start = invalid_span[0] + end = invalid_span[1] + entry_start[start].remove(prop) + if not entry_start.get(start): + del entry_start[start] + entry_end[end].remove(prop) + if not entry_end.get(end): + del entry_end[end] + + for prop, prop_duplicate_matches in duplicate_matches.items(): + # Keeping the last valid match. + # Needed for the.100.109.hdtv-lol.mp4 + for duplicate_match in prop_duplicate_matches[:-1]: + entries.remove((prop, duplicate_match)) + + if multiple: + ret = entries + else: + # keep only best match if multiple values where found + entries_dict = {} + for entry in entries: + for key in prop.keys: + if key not in entries_dict: + entries_dict[key] = [] + entries_dict[key].append(entry) + + for key_entries in entries_dict.values(): + if multiple: + for entry in key_entries: + ret.append(entry) + else: + best_ret = {} + + best_prop, best_match = None, None + if len(key_entries) == 1: + best_prop, best_match = key_entries[0] + else: + for prop, match in key_entries: + start, end = _get_span(prop, match) + if not best_prop or \ + best_prop.confidence < best_prop.confidence or \ + best_prop.confidence == best_prop.confidence and \ + best_match.span()[1] - best_match.span()[0] < match.span()[1] - match.span()[0]: + best_prop, best_match = prop, match + + best_ret[best_prop] = best_match + + for prop, match in best_ret.items(): + ret.append((prop, match)) + + if sort: + def _sorting(x): + _, x_match = x + x_start, x_end = x_match.span() + return x_start - x_end + + ret.sort(key=_sorting) + + return ret + + def as_guess(self, found_properties, input=None, filter_=None, sep_replacement=None, multiple=False, *args, **kwargs): + if filter_ is None: + filter_ = lambda property, *args, **kwargs: True + guesses = [] if multiple else None + for prop, match in found_properties: + 
first_key = None + for key in prop.keys: + # First property key will be used as base for effective name + if isinstance(key, base_text_type): + if first_key is None: + first_key = key + break + property_name = first_key if first_key else None + span = _get_span(prop, match) + guess = Guess(confidence=prop.confidence, input=input, span=span, prop=property_name) + groups = _get_groups(match.re) + for group_name in groups: + name = group_name if isinstance(group_name, base_text_type) else property_name if property_name not in groups else None + if name: + value = self._effective_prop_value(prop, group_name, input, match.span(group_name) if group_name else match.span(), sep_replacement) + if not value is None: + is_string = isinstance(value, base_text_type) + if not is_string or is_string and value: # Keep non empty strings and other defined objects + if isinstance(value, dict): + for k, v in value.items(): + if k is None: + k = name + guess[k] = v + else: + if name in guess: + if not isinstance(guess[name], list): + guess[name] = [guess[name]] + guess[name].append(value) + else: + guess[name] = value + if group_name: + guess.metadata(prop).span = match.span(group_name) + if filter_(guess): + if multiple: + guesses.append(guess) + else: + return guess + return guesses + + def _effective_prop_value(self, prop, group_name, input=None, span=None, sep_replacement=None): + if prop.canonical_form: + return prop.canonical_form + if input is None: + return None + value = input + if span is not None: + value = value[span[0]:span[1]] + value = input[span[0]:span[1]] if input else None + if sep_replacement: + for sep_char in sep: + value = value.replace(sep_char, sep_replacement) + if value: + value = prop.format(value, group_name) + return value + + def get_properties(self, name=None, canonical_form=None): + """Retrieve properties + + :return: Properties + :rtype: generator + """ + for prop in self._properties: + if (name is None or name in prop.keys) and (canonical_form is None 
or prop.canonical_form == canonical_form): + yield prop + + def get_supported_properties(self): + supported_properties = {} + for prop in self.get_properties(): + for k in prop.keys: + values = supported_properties.get(k) + if not values: + values = set() + supported_properties[k] = values + if prop.canonical_form: + values.add(prop.canonical_form) + return supported_properties + + +class QualitiesContainer(): + def __init__(self): + self._qualities = {} + + def register_quality(self, name, canonical_form, rating): + """Register a quality rating. + + :param name: Name of the property + :type name: string + :param canonical_form: Value of the property + :type canonical_form: string + :param rating: Estimated quality rating for the property + :type rating: int + """ + property_qualities = self._qualities.get(name) + + if property_qualities is None: + property_qualities = {} + self._qualities[name] = property_qualities + + property_qualities[canonical_form] = rating + + def unregister_quality(self, name, *canonical_forms): + """Unregister quality ratings for given property name. + + If canonical_forms are specified, only those values will be unregistered + + :param name: Name of the property + :type name: string + :param canonical_forms: Value of the property + :type canonical_forms: string + """ + if not canonical_forms: + if name in self._qualities: + del self._qualities[name] + else: + property_qualities = self._qualities.get(name) + if property_qualities is not None: + for property_canonical_form in canonical_forms: + if property_canonical_form in property_qualities: + del property_qualities[property_canonical_form] + if not property_qualities: + del self._qualities[name] + + def clear_qualities(self,): + """Unregister all defined quality ratings. + """ + self._qualities.clear() + + def rate_quality(self, guess, *props): + """Rate the quality of guess. 
+ + :param guess: Guess to rate + :type guess: :class:`guessit.guess.Guess` + :param props: Properties to include in the rating. if empty, rating will be performed for all guess properties. + :type props: varargs of string + + :return: Quality of the guess. The higher, the better. + :rtype: int + """ + rate = 0 + if not props: + props = guess.keys() + for prop in props: + prop_value = guess.get(prop) + prop_qualities = self._qualities.get(prop) + if prop_value is not None and prop_qualities is not None: + rate += prop_qualities.get(prop_value, 0) + return rate + + def best_quality_properties(self, props, *guesses): + """Retrieve the best quality guess, based on given properties + + :param props: Properties to include in the rating + :type props: list of strings + :param guesses: Guesses to rate + :type guesses: :class:`guessit.guess.Guess` + + :return: Best quality guess from all passed guesses + :rtype: :class:`guessit.guess.Guess` + """ + best_guess = None + best_rate = None + for guess in guesses: + rate = self.rate_quality(guess, *props) + if best_rate is None or best_rate < rate: + best_rate = rate + best_guess = guess + return best_guess + + def best_quality(self, *guesses): + """Retrieve the best quality guess. 
+ + :param guesses: Guesses to rate + :type guesses: :class:`guessit.guess.Guess` + + :return: Best quality guess from all passed guesses + :rtype: :class:`guessit.guess.Guess` + """ + best_guess = None + best_rate = None + for guess in guesses: + rate = self.rate_quality(guess) + if best_rate is None or best_rate < rate: + best_rate = rate + best_guess = guess + return best_guess + diff --git a/libs/guessit/country.py b/libs/guessit/country.py deleted file mode 100644 index 944b7df6..00000000 --- a/libs/guessit/country.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import unicode_literals -from guessit import UnicodeMixin, base_text_type, u -from guessit.fileutils import load_file_in_same_dir -import logging - -__all__ = [ 'Country' ] - -log = logging.getLogger(__name__) - - -# parsed from http://en.wikipedia.org/wiki/ISO_3166-1 -# -# Description of the fields: -# "An English name, an alpha-2 code (when given), -# an alpha-3 code (when given), a numeric code, and an ISO 31666-2 code -# are all separated by pipe (|) characters." 
-_iso3166_contents = load_file_in_same_dir(__file__, 'ISO-3166-1_utf8.txt') - -country_matrix = [ l.strip().split('|') - for l in _iso3166_contents.strip().split('\n') ] - -country_matrix += [ [ 'Unknown', 'un', 'unk', '', '' ], - [ 'Latin America', '', 'lat', '', '' ] - ] - -country_to_alpha3 = dict((c[0].lower(), c[2].lower()) for c in country_matrix) -country_to_alpha3.update(dict((c[1].lower(), c[2].lower()) for c in country_matrix)) -country_to_alpha3.update(dict((c[2].lower(), c[2].lower()) for c in country_matrix)) - -# add here exceptions / non ISO representations -# Note: remember to put those exceptions in lower-case, they won't work otherwise -country_to_alpha3.update({ 'latinoamérica': 'lat', - 'brazilian': 'bra', - 'españa': 'esp', - 'uk': 'gbr' - }) - -country_alpha3_to_en_name = dict((c[2].lower(), c[0]) for c in country_matrix) -country_alpha3_to_alpha2 = dict((c[2].lower(), c[1].lower()) for c in country_matrix) - - - -class Country(UnicodeMixin): - """This class represents a country. - - You can initialize it with pretty much anything, as it knows conversion - from ISO-3166 2-letter and 3-letter codes, and an English name. 
- """ - - def __init__(self, country, strict=False): - country = u(country.strip().lower()) - self.alpha3 = country_to_alpha3.get(country) - - if self.alpha3 is None and strict: - msg = 'The given string "%s" could not be identified as a country' - raise ValueError(msg % country) - - if self.alpha3 is None: - self.alpha3 = 'unk' - - - @property - def alpha2(self): - return country_alpha3_to_alpha2[self.alpha3] - - @property - def english_name(self): - return country_alpha3_to_en_name[self.alpha3] - - def __hash__(self): - return hash(self.alpha3) - - def __eq__(self, other): - if isinstance(other, Country): - return self.alpha3 == other.alpha3 - - if isinstance(other, base_text_type): - try: - return self == Country(other) - except ValueError: - return False - - return False - - def __ne__(self, other): - return not self == other - - def __unicode__(self): - return self.english_name - - def __repr__(self): - return 'Country(%s)' % self.english_name diff --git a/libs/guessit/date.py b/libs/guessit/date.py index bd84c65d..ed38d1ba 100644 --- a/libs/guessit/date.py +++ b/libs/guessit/date.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2011 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,15 +18,38 @@ # along with this program. If not, see . 
# -from __future__ import unicode_literals +from __future__ import absolute_import, division, print_function, unicode_literals + import datetime + import re -def valid_year(year): - return 1920 < year < datetime.date.today().year + 5 +from dateutil import parser + + +_dsep = r'[-/ \.]' +_dsep_bis = r'[-/ \.x]' + +date_regexps = [ + re.compile('[^\d](\d{8})[^\d]', re.IGNORECASE), + re.compile('[^\d](\d{6})[^\d]', re.IGNORECASE), + re.compile('[^\d](\d{2})%s(\d{1,2})%s(\d{1,2})[^\d]' % (_dsep, _dsep), re.IGNORECASE), + re.compile('[^\d](\d{1,2})%s(\d{1,2})%s(\d{2})[^\d]' % (_dsep, _dsep), re.IGNORECASE), + re.compile('[^\d](\d{4})%s(\d{1,2})%s(\d{1,2})[^\d]' % (_dsep_bis, _dsep), re.IGNORECASE), + re.compile('[^\d](\d{1,2})%s(\d{1,2})%s(\d{4})[^\d]' % (_dsep, _dsep_bis), re.IGNORECASE), + re.compile('[^\d](\d{1,2}(?:st|nd|rd|th)?%s(?:[a-z]{3,10})%s\d{4})[^\d]' % (_dsep, _dsep), re.IGNORECASE)] + + +def valid_year(year, today=None): + """Check if number is a valid year""" + if not today: + today = datetime.date.today() + return 1920 < year < today.year + 5 + def search_year(string): """Looks for year patterns, and if found return the year and group span. + Assumes there are sentinels at the beginning and end of the string that always allow matching a non-digit delimiting the date. @@ -34,10 +57,10 @@ def search_year(string): and now + 5 years, so for instance 2000 would be returned as a valid year but 1492 would not. - >>> search_year('in the year 2000...') - (2000, (12, 16)) + >>> search_year(' in the year 2000... ') + (2000, (13, 17)) - >>> search_year('they arrived in 1492.') + >>> search_year(' they arrived in 1492. ') (None, None) """ match = re.search(r'[^0-9]([0-9]{4})[^0-9]', string) @@ -49,85 +72,58 @@ def search_year(string): return (None, None) -def search_date(string): +def search_date(string, year_first=None, day_first=True): """Looks for date patterns, and if found return the date and group span. 
+ Assumes there are sentinels at the beginning and end of the string that always allow matching a non-digit delimiting the date. - >>> search_date('This happened on 2002-04-22.') - (datetime.date(2002, 4, 22), (17, 27)) + Year can be defined on two digit only. It will return the nearest possible + date from today. - >>> search_date('And this on 17-06-1998.') - (datetime.date(1998, 6, 17), (12, 22)) + >>> search_date(' This happened on 2002-04-22. ') + (datetime.date(2002, 4, 22), (18, 28)) - >>> search_date('no date in here') + >>> search_date(' And this on 17-06-1998. ') + (datetime.date(1998, 6, 17), (13, 23)) + + >>> search_date(' no date in here ') (None, None) """ + start, end = None, None + match = None + for date_re in date_regexps: + s = date_re.search(string) + if s and (match is None or s.end() - s.start() > len(match)): + start, end = s.start(), s.end() + if date_re.groups: + match = '-'.join(s.groups()) + else: + match = s.group() - dsep = r'[-/ \.]' + if match is None: + return None, None - date_rexps = [ - # 20010823 - r'[^0-9]' + - r'(?P[0-9]{4})' + - r'(?P[0-9]{2})' + - r'(?P[0-9]{2})' + - r'[^0-9]', + today = datetime.date.today() - # 2001-08-23 - r'[^0-9]' + - r'(?P[0-9]{4})' + dsep + - r'(?P[0-9]{2})' + dsep + - r'(?P[0-9]{2})' + - r'[^0-9]', + # If day_first/year_first is undefined, parse is made using both possible values. 
+ yearfirst_opts = [False, True] + if year_first is not None: + yearfirst_opts = [year_first] - # 23-08-2001 - r'[^0-9]' + - r'(?P[0-9]{2})' + dsep + - r'(?P[0-9]{2})' + dsep + - r'(?P[0-9]{4})' + - r'[^0-9]', - - # 23-08-01 - r'[^0-9]' + - r'(?P[0-9]{2})' + dsep + - r'(?P[0-9]{2})' + dsep + - r'(?P[0-9]{2})' + - r'[^0-9]', - ] - - for drexp in date_rexps: - match = re.search(drexp, string) - if match: - d = match.groupdict() - year, month, day = int(d['year']), int(d['month']), int(d['day']) - # years specified as 2 digits should be adjusted here - if year < 100: - if year > (datetime.date.today().year % 100) + 5: - year = 1900 + year - else: - year = 2000 + year + dayfirst_opts = [True, False] + if day_first is not None: + dayfirst_opts = [day_first] + kwargs_list = ({'dayfirst': d, 'yearfirst': y} for d in dayfirst_opts for y in yearfirst_opts) + for kwargs in kwargs_list: + try: + date = parser.parse(match, **kwargs) + except (ValueError, TypeError) as e: #see https://bugs.launchpad.net/dateutil/+bug/1247643 date = None - try: - date = datetime.date(year, month, day) - except ValueError: - try: - date = datetime.date(year, day, month) - except ValueError: - pass - - if date is None: - continue - - # check date plausibility - if not 1900 < date.year < datetime.date.today().year + 5: - continue - - # looks like we have a valid date - # note: span is [+1,-1] because we don't want to include the - # non-digit char - start, end = match.span() - return (date, (start + 1, end - 1)) + pass + # check date plausibility + if date and valid_year(date.year, today=today): + return date.date(), (start+1, end-1) #compensate for sentinels return None, None diff --git a/libs/guessit/fileutils.py b/libs/guessit/fileutils.py index 9531f82a..40110485 100644 --- a/libs/guessit/fileutils.py +++ b/libs/guessit/fileutils.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2011 Nicolas Wack +# Copyright (c) 2013 
Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,7 +18,8 @@ # along with this program. If not, see . # -from __future__ import unicode_literals +from __future__ import absolute_import, division, print_function, unicode_literals + from guessit import s, u import os.path import zipfile @@ -44,17 +45,13 @@ def split_path(path): result = [] while True: head, tail = os.path.split(path) - headlen = len(head) - # on Unix systems, the root folder is '/' - if head and head == '/'*headlen and tail == '': - return ['/'] + result + if not head and not tail: + return result - # on Windows, the root folder is a drive letter (eg: 'C:\') or for shares \\ - if ((headlen == 3 and head[1:] == ':\\') or (headlen == 2 and head == '\\\\')) and tail == '': - return [head] + result - - if head == '' and tail == '': + if not tail and head == path: + # Make sure we won't have an infinite loop. + result = [head] + result return result # we just split a directory ending with '/', so tail is empty @@ -70,8 +67,8 @@ def split_path(path): def file_in_same_dir(ref_file, desired_file): """Return the path for a file in the same dir as a given reference file. 
- >>> s(file_in_same_dir('~/smewt/smewt.db', 'smewt.settings')) - '~/smewt/smewt.settings' + >>> s(file_in_same_dir('~/smewt/smewt.db', 'smewt.settings')) == os.path.normpath('~/smewt/smewt.settings') + True """ return os.path.join(*(split_path(ref_file)[:-1] + [desired_file])) @@ -85,6 +82,6 @@ def load_file_in_same_dir(ref_file, filename): if p.endswith('.zip'): zfilename = os.path.join(*path[:i + 1]) zfile = zipfile.ZipFile(zfilename) - return zfile.read('/'.join(path[i + 1:])) + return u(zfile.read('/'.join(path[i + 1:]))) return u(io.open(os.path.join(*path), encoding='utf-8').read()) diff --git a/libs/guessit/guess.py b/libs/guessit/guess.py index 73babceb..c0f401f2 100644 --- a/libs/guessit/guess.py +++ b/libs/guessit/guess.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2011 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,10 +18,10 @@ # along with this program. If not, see . # -from __future__ import unicode_literals +from __future__ import absolute_import, division, print_function, unicode_literals + from guessit import UnicodeMixin, s, u, base_text_type -from guessit.language import Language -from guessit.country import Country +from babelfish import Language, Country import json import datetime import logging @@ -29,6 +29,111 @@ import logging log = logging.getLogger(__name__) +class GuessMetadata(object): + """GuessMetadata contains confidence, an input string, span and related property. + + If defined on a property of Guess object, it overrides the object defined as global. 
+ + :param parent: The parent metadata, used for undefined properties in self object + :type parent: :class: `GuessMedata` + :param confidence: The confidence (from 0.0 to 1.0) + :type confidence: number + :param input: The input string + :type input: string + :param span: The input string + :type span: tuple (int, int) + :param prop: The found property definition + :type prop: :class `guessit.containers._Property` + """ + def __init__(self, parent=None, confidence=None, input=None, span=None, prop=None, *args, **kwargs): + self.parent = parent + if confidence is None and self.parent is None: + self._confidence = 1.0 + else: + self._confidence = confidence + self._input = input + self._span = span + self._prop = prop + + @property + def confidence(self): + """The confidence + + :rtype: int + :return: confidence value + """ + return self._confidence if self._confidence is not None else self.parent.confidence if self.parent else None + + @confidence.setter + def confidence(self, confidence): + self._confidence = confidence + + @property + def input(self): + """The input + + :rtype: string + :return: String used to find this guess value + """ + return self._input if self._input is not None else self.parent.input if self.parent else None + + @input.setter + def input(self, input): + """The input + + :rtype: string + """ + self._input = input + + @property + def span(self): + """The span + + :rtype: tuple (int, int) + :return: span of input string used to find this guess value + """ + return self._span if self._span is not None else self.parent.span if self.parent else None + + @span.setter + def span(self, span): + """The span + + :rtype: tuple (int, int) + :return: span of input string used to find this guess value + """ + self._span = span + + @property + def prop(self): + """The property + + :rtype: :class:`_Property` + :return: The property + """ + return self._prop if self._prop is not None else self.parent.prop if self.parent else None + + @property + def 
raw(self): + """Return the raw information (original match from the string, + not the cleaned version) associated with the given property name.""" + if self.input and self.span: + return self.input[self.span[0]:self.span[1]] + return None + + def __repr__(self, *args, **kwargs): + return object.__repr__(self, *args, **kwargs) + + +def _split_kwargs(**kwargs): + metadata_args = {} + for prop in dir(GuessMetadata): + try: + metadata_args[prop] = kwargs.pop(prop) + except KeyError: + pass + return metadata_args, kwargs + + class Guess(UnicodeMixin, dict): """A Guess is a dictionary which has an associated confidence for each of its values. @@ -37,91 +142,125 @@ class Guess(UnicodeMixin, dict): simple dict.""" def __init__(self, *args, **kwargs): - try: - confidence = kwargs.pop('confidence') - except KeyError: - confidence = 0 - - try: - raw = kwargs.pop('raw') - except KeyError: - raw = None - + metadata_kwargs, kwargs = _split_kwargs(**kwargs) + self._global_metadata = GuessMetadata(**metadata_kwargs) dict.__init__(self, *args, **kwargs) - self._confidence = {} - self._raw = {} + self._metadata = {} for prop in self: - self._confidence[prop] = confidence - self._raw[prop] = raw - + self._metadata[prop] = GuessMetadata(parent=self._global_metadata) + + def rename(self, old_name, new_name): + if old_name in self._metadata: + metadata = self._metadata[old_name] + del self._metadata[old_name] + self._metadata[new_name] = metadata + if old_name in self: + value = self[old_name] + del self[old_name] + self[new_name] = value + return True + return False + def to_dict(self, advanced=False): + """Return the guess as a dict containing only base types, ie: + where dates, languages, countries, etc. are converted to strings. 
+ + if advanced is True, return the data as a json string containing + also the raw information of the properties.""" data = dict(self) for prop, value in data.items(): if isinstance(value, datetime.date): data[prop] = value.isoformat() - elif isinstance(value, (Language, Country, base_text_type)): + elif isinstance(value, (UnicodeMixin, base_text_type)): data[prop] = u(value) + elif isinstance(value, (Language, Country)): + data[prop] = value.guessit elif isinstance(value, list): data[prop] = [u(x) for x in value] if advanced: - data[prop] = {"value": data[prop], "raw": self.raw(prop), "confidence": self.confidence(prop)} + metadata = self.metadata(prop) + prop_data = {'value': data[prop]} + if metadata.raw: + prop_data['raw'] = metadata.raw + if metadata.confidence: + prop_data['confidence'] = metadata.confidence + data[prop] = prop_data return data def nice_string(self, advanced=False): + """Return a string with the property names and their values, + that also displays the associated confidence to each property. 
+ + FIXME: doc with param""" if advanced: data = self.to_dict(advanced) return json.dumps(data, indent=4) - else: + else: data = self.to_dict() - + parts = json.dumps(data, indent=4).split('\n') for i, p in enumerate(parts): if p[:5] != ' "': continue - + prop = p.split('"')[1] parts[i] = (' [%.2f] "' % self.confidence(prop)) + p[5:] - + return '\n'.join(parts) def __unicode__(self): return u(self.to_dict()) - def confidence(self, prop): - return self._confidence.get(prop, -1) - + def metadata(self, prop=None): + """Return the metadata associated with the given property name + + If no property name is given, get the global_metadata + """ + if prop is None: + return self._global_metadata + if prop not in self._metadata: + self._metadata[prop] = GuessMetadata(parent=self._global_metadata) + return self._metadata[prop] + + def confidence(self, prop=None): + return self.metadata(prop).confidence + + def set_confidence(self, prop, confidence): + self.metadata(prop).confidence = confidence + def raw(self, prop): - return self._raw.get(prop, None) + return self.metadata(prop).raw - def set(self, prop, value, confidence=None, raw=None): - self[prop] = value - if confidence is not None: - self._confidence[prop] = confidence - if raw is not None: - self._raw[prop] = raw + def set(self, prop_name, value, *args, **kwargs): + if value is None: + try: + del self[prop_name] + except KeyError: + pass + try: + del self._metadata[prop_name] + except KeyError: + pass + else: + self[prop_name] = value + if 'metadata' in kwargs.keys(): + self._metadata[prop_name] = kwargs['metadata'] + else: + self._metadata[prop_name] = GuessMetadata(parent=self._global_metadata, *args, **kwargs) - def set_confidence(self, prop, value): - self._confidence[prop] = value - - def set_raw(self, prop, value): - self._raw[prop] = value - - def update(self, other, confidence=None, raw=None): + def update(self, other, confidence=None): dict.update(self, other) if isinstance(other, Guess): for prop in other: - 
self._confidence[prop] = other.confidence(prop) - self._raw[prop] = other.raw(prop) - + try: + self._metadata[prop] = other._metadata[prop] + except KeyError: + pass if confidence is not None: for prop in other: - self._confidence[prop] = confidence - - if raw is not None: - for prop in other: - self._raw[prop] = raw + self.set_confidence(prop, confidence) def update_highest_confidence(self, other): """Update this guess with the values from the given one. In case @@ -131,17 +270,16 @@ class Guess(UnicodeMixin, dict): raise ValueError('Can only call this function on Guess instances') for prop in other: - if prop in self and self.confidence(prop) >= other.confidence(prop): + if prop in self and self.metadata(prop).confidence >= other.metadata(prop).confidence: continue self[prop] = other[prop] - self._confidence[prop] = other.confidence(prop) - self._raw[prop] = other.raw(prop) + self._metadata[prop] = other.metadata(prop) def choose_int(g1, g2): """Function used by merge_similar_guesses to choose between 2 possible properties when they are integers.""" - v1, c1 = g1 # value, confidence + v1, c1 = g1 # value, confidence v2, c2 = g2 if (v1 == v2): return (v1, 1 - (1 - c1) * (1 - c2)) @@ -179,7 +317,7 @@ def choose_string(g1, g2): ('The Simpsons', 0.75) """ - v1, c1 = g1 # value, confidence + v1, c1 = g1 # value, confidence v2, c2 = g2 if not v1: @@ -193,26 +331,26 @@ def choose_string(g1, g2): combined_prob = 1 - (1 - c1) * (1 - c2) if v1l == v2l: - return (v1, combined_prob) + return v1, combined_prob # check for common patterns elif v1l == 'the ' + v2l: - return (v1, combined_prob) + return v1, combined_prob elif v2l == 'the ' + v1l: - return (v2, combined_prob) + return v2, combined_prob # if one string is contained in the other, return the shortest one elif v2l in v1l: - return (v2, combined_prob) + return v2, combined_prob elif v1l in v2l: - return (v1, combined_prob) + return v1, combined_prob # in case of conflict, return the one with highest confidence else: 
if c1 > c2: - return (v1, c1 - c2) + return v1, c1 - c2 else: - return (v2, c2 - c1) + return v2, c2 - c1 def _merge_similar_guesses_nocheck(guesses, prop, choose): @@ -226,17 +364,7 @@ def _merge_similar_guesses_nocheck(guesses, prop, choose): g1, g2 = similar[0], similar[1] - other_props = set(g1) & set(g2) - set([prop]) - if other_props: - log.debug('guess 1: %s' % g1) - log.debug('guess 2: %s' % g2) - for prop in other_props: - if g1[prop] != g2[prop]: - log.warning('both guesses to be merged have more than one ' - 'different property in common, bailing out...') - return - - # merge all props of s2 into s1, updating the confidence for the + # merge only this prop of s2 into s1, updating the confidence for the # considered property v1, v2 = g1[prop], g2[prop] c1, c2 = g1.confidence(prop), g2.confidence(prop) @@ -248,11 +376,12 @@ def _merge_similar_guesses_nocheck(guesses, prop, choose): msg = "Updating non-matching property '%s' with confidence %.2f" log.debug(msg % (prop, new_confidence)) - g2[prop] = new_value - g2.set_confidence(prop, new_confidence) + g1.set(prop, new_value, confidence=new_confidence) + g2.pop(prop) - g1.update(g2) - guesses.remove(g2) + # remove g2 if there are no properties left + if not g2.keys(): + guesses.remove(g2) def merge_similar_guesses(guesses, prop, choose): @@ -286,43 +415,53 @@ def merge_all(guesses, append=None): instead of being merged. >>> s(merge_all([ Guess({'season': 2}, confidence=0.6), - ... Guess({'episodeNumber': 13}, confidence=0.8) ])) - {'season': 2, 'episodeNumber': 13} + ... Guess({'episodeNumber': 13}, confidence=0.8) ]) + ... ) == {'season': 2, 'episodeNumber': 13} + True + >>> s(merge_all([ Guess({'episodeNumber': 27}, confidence=0.02), - ... Guess({'season': 1}, confidence=0.2) ])) - {'season': 1} + ... Guess({'season': 1}, confidence=0.2) ]) + ... ) == {'season': 1} + True >>> s(merge_all([ Guess({'other': 'PROPER'}, confidence=0.8), ... Guess({'releaseGroup': '2HD'}, confidence=0.8) ], - ... 
append=['other'])) - {'releaseGroup': '2HD', 'other': ['PROPER']} - + ... append=['other']) + ... ) == {'releaseGroup': '2HD', 'other': ['PROPER']} + True """ + result = Guess() if not guesses: - return Guess() + return result - result = guesses[0] if append is None: append = [] - for g in guesses[1:]: + for g in guesses: # first append our appendable properties for prop in append: if prop in g: - result.set(prop, result.get(prop, []) + [g[prop]], + if isinstance(g[prop], (list, set)): + new_values = result.get(prop, []) + list(g[prop]) + else: + new_values = result.get(prop, []) + [g[prop]] + + result.set(prop, new_values, # TODO: what to do with confidence here? maybe an # arithmetic mean... - confidence=g.confidence(prop), - raw=g.raw(prop)) + confidence=g.metadata(prop).confidence, + input=g.metadata(prop).input, + span=g.metadata(prop).span, + prop=g.metadata(prop).prop) del g[prop] # then merge the remaining ones dups = set(result) & set(g) if dups: - log.warning('duplicate properties %s in merged result...' % [ (result[p], g[p]) for p in dups] ) + log.debug('duplicate properties %s in merged result...' % [(result[p], g[p]) for p in dups]) result.update_highest_confidence(g) @@ -338,8 +477,38 @@ def merge_all(guesses, append=None): if isinstance(value, list): result[prop] = list(set(value)) else: - result[prop] = [ value ] + result[prop] = [value] except KeyError: pass return result + + +def smart_merge(guesses): + """First tries to merge well-known similar properties, and then merges + the rest with a merge_all call. + + Should be the function to call in most cases, unless one wants to have more + control. + + Warning: this function is destructive, ie: it will merge the list in-place. 
+ """ + + # 1- try to merge similar information together and give it a higher + # confidence + for int_part in ('year', 'season', 'episodeNumber'): + merge_similar_guesses(guesses, int_part, choose_int) + + for string_part in ('title', 'series', 'container', 'format', + 'releaseGroup', 'website', 'audioCodec', + 'videoCodec', 'screenSize', 'episodeFormat', + 'audioChannels', 'idNumber'): + merge_similar_guesses(guesses, string_part, choose_string) + + # 2- merge the rest, potentially discarding information not properly + # merged before + result = merge_all(guesses, + append=['language', 'subtitleLanguage', 'other', + 'episodeDetails', 'unidentified']) + + return result diff --git a/libs/guessit/hash_ed2k.py b/libs/guessit/hash_ed2k.py index 7422d4e9..a1ea562f 100644 --- a/libs/guessit/hash_ed2k.py +++ b/libs/guessit/hash_ed2k.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2011 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,17 +18,21 @@ # along with this program. If not, see . # -from __future__ import unicode_literals +from __future__ import absolute_import, division, print_function, unicode_literals + from guessit import s, to_hex import hashlib import os.path +from functools import reduce + def hash_file(filename): """Returns the ed2k hash of a given file. 
- >>> s(hash_file('tests/dummy.srt')) - 'ed2k://|file|dummy.srt|44|1CA0B9DED3473B926AA93A0A546138BB|/' + >>> testfile = os.path.join(os.path.dirname(__file__), 'test/dummy.srt') + >>> s(hash_file(testfile)) + 'ed2k://|file|dummy.srt|59|41F58B913AB3973F593BEBA8B8DF6510|/' """ return 'ed2k://|file|%s|%d|%s|/' % (os.path.basename(filename), os.path.getsize(filename), diff --git a/libs/guessit/hash_mpc.py b/libs/guessit/hash_mpc.py index c9dd4292..fb6c52bd 100644 --- a/libs/guessit/hash_mpc.py +++ b/libs/guessit/hash_mpc.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2011 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,7 +18,8 @@ # along with this program. If not, see . # -from __future__ import unicode_literals +from __future__ import absolute_import, division, print_function, unicode_literals + import struct import os @@ -28,7 +29,7 @@ def hash_file(filename): http://trac.opensubtitles.org/projects/opensubtitles/wiki/HashSourceCodes and is licensed under the GPL.""" - longlongformat = 'q' # long long + longlongformat = b'q' # long long bytesize = struct.calcsize(longlongformat) f = open(filename, "rb") @@ -39,18 +40,18 @@ def hash_file(filename): if filesize < 65536 * 2: raise Exception("SizeError: size is %d, should be > 132K..." 
% filesize) - for x in range(65536 / bytesize): + for x in range(int(65536 / bytesize)): buf = f.read(bytesize) (l_value,) = struct.unpack(longlongformat, buf) hash_value += l_value - hash_value = hash_value & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number + hash_value &= 0xFFFFFFFFFFFFFFFF # to remain as 64bit number f.seek(max(0, filesize - 65536), 0) - for x in range(65536 / bytesize): + for x in range(int(65536 / bytesize)): buf = f.read(bytesize) (l_value,) = struct.unpack(longlongformat, buf) hash_value += l_value - hash_value = hash_value & 0xFFFFFFFFFFFFFFFF + hash_value &= 0xFFFFFFFFFFFFFFFF f.close() diff --git a/libs/guessit/language.py b/libs/guessit/language.py index 4d22cf05..7e32af3c 100644 --- a/libs/guessit/language.py +++ b/libs/guessit/language.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2011 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,373 +18,284 @@ # along with this program. If not, see . 
# -from __future__ import unicode_literals -from guessit import UnicodeMixin, base_text_type, u, s -from guessit.fileutils import load_file_in_same_dir +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit import UnicodeMixin, base_text_type, u from guessit.textutils import find_words -from guessit.country import Country +from babelfish import Language, Country +import babelfish import re import logging +from guessit.guess import Guess -__all__ = [ 'is_iso_language', 'is_language', 'lang_set', 'Language', - 'ALL_LANGUAGES', 'ALL_LANGUAGES_NAMES', 'UNDETERMINED', - 'search_language', 'guess_language' ] - +__all__ = ['Language', 'UNDETERMINED', + 'search_language', 'guess_language'] log = logging.getLogger(__name__) +UNDETERMINED = babelfish.Language('und') -# downloaded from http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt -# -# Description of the fields: -# "An alpha-3 (bibliographic) code, an alpha-3 (terminologic) code (when given), -# an alpha-2 code (when given), an English name, and a French name of a language -# are all separated by pipe (|) characters." 
-_iso639_contents = load_file_in_same_dir(__file__, 'ISO-639-2_utf-8.txt') - -# drop the BOM from the beginning of the file -_iso639_contents = _iso639_contents[1:] - -language_matrix = [ l.strip().split('|') - for l in _iso639_contents.strip().split('\n') ] +SYN = {('und', None): ['unknown', 'inconnu', 'unk', 'un'], + ('ell', None): ['gr', 'greek'], + ('spa', None): ['esp', 'español'], + ('fra', None): ['français', 'vf', 'vff', 'vfi'], + ('swe', None): ['se'], + ('por', 'BR'): ['po', 'pb', 'pob', 'br', 'brazilian'], + ('cat', None): ['català'], + ('ces', None): ['cz'], + ('ukr', None): ['ua'], + ('zho', None): ['cn'], + ('jpn', None): ['jp'], + ('hrv', None): ['scr'], + ('mul', None): ['multi', 'dl'], # http://scenelingo.wordpress.com/2009/03/24/what-does-dl-mean/ + } -# update information in the language matrix -language_matrix += [['mol', '', 'mo', 'Moldavian', 'moldave'], - ['ass', '', '', 'Assyrian', 'assyrien']] - -for lang in language_matrix: - # remove unused languages that shadow other common ones with a non-official form - if (lang[2] == 'se' or # Northern Sami shadows Swedish - lang[2] == 'br'): # Breton shadows Brazilian - lang[2] = '' - # add missing information - if lang[0] == 'und': - lang[2] = 'un' - if lang[0] == 'srp': - lang[1] = 'scc' # from OpenSubtitles - - -lng3 = frozenset(l[0] for l in language_matrix if l[0]) -lng3term = frozenset(l[1] for l in language_matrix if l[1]) -lng2 = frozenset(l[2] for l in language_matrix if l[2]) -lng_en_name = frozenset(lng for l in language_matrix - for lng in l[3].lower().split('; ') if lng) -lng_fr_name = frozenset(lng for l in language_matrix - for lng in l[4].lower().split('; ') if lng) -lng_all_names = lng3 | lng3term | lng2 | lng_en_name | lng_fr_name - -lng3_to_lng3term = dict((l[0], l[1]) for l in language_matrix if l[1]) -lng3term_to_lng3 = dict((l[1], l[0]) for l in language_matrix if l[1]) - -lng3_to_lng2 = dict((l[0], l[2]) for l in language_matrix if l[2]) -lng2_to_lng3 = dict((l[2], l[0]) for l 
in language_matrix if l[2]) - -# we only return the first given english name, hoping it is the most used one -lng3_to_lng_en_name = dict((l[0], l[3].split('; ')[0]) - for l in language_matrix if l[3]) -lng_en_name_to_lng3 = dict((en_name.lower(), l[0]) - for l in language_matrix if l[3] - for en_name in l[3].split('; ')) - -# we only return the first given french name, hoping it is the most used one -lng3_to_lng_fr_name = dict((l[0], l[4].split('; ')[0]) - for l in language_matrix if l[4]) -lng_fr_name_to_lng3 = dict((fr_name.lower(), l[0]) - for l in language_matrix if l[4] - for fr_name in l[4].split('; ')) - -# contains a list of exceptions: strings that should be parsed as a language -# but which are not in an ISO form -lng_exceptions = { 'unknown': ('und', None), - 'inconnu': ('und', None), - 'unk': ('und', None), - 'un': ('und', None), - 'gr': ('gre', None), - 'greek': ('gre', None), - 'esp': ('spa', None), - 'español': ('spa', None), - 'se': ('swe', None), - 'po': ('pt', 'br'), - 'pb': ('pt', 'br'), - 'pob': ('pt', 'br'), - 'br': ('pt', 'br'), - 'brazilian': ('pt', 'br'), - 'català': ('cat', None), - 'cz': ('cze', None), - 'ua': ('ukr', None), - 'cn': ('chi', None), - 'chs': ('chi', None), - 'jp': ('jpn', None), - 'scr': ('hrv', None) - } - - -def is_iso_language(language): - return language.lower() in lng_all_names - -def is_language(language): - return is_iso_language(language) or language in lng_exceptions - -def lang_set(languages, strict=False): - """Return a set of guessit.Language created from their given string - representation. - - if strict is True, then this will raise an exception if any language - could not be identified. - """ - return set(Language(l, strict=strict) for l in languages) - - -class Language(UnicodeMixin): - """This class represents a human language. - - You can initialize it with pretty much anything, as it knows conversion - from ISO-639 2-letter and 3-letter codes, English and French names. 
- - You can also distinguish languages for specific countries, such as - Portuguese and Brazilian Portuguese. - - There are various properties on the language object that give you the - representation of the language for a specific usage, such as .alpha3 - to get the ISO 3-letter code, or .opensubtitles to get the OpenSubtitles - language code. - - >>> Language('fr') - Language(French) - - >>> s(Language('eng').french_name) - 'anglais' - - >>> s(Language('pt(br)').country.english_name) - 'Brazil' - - >>> s(Language('Español (Latinoamérica)').country.english_name) - 'Latin America' - - >>> Language('Spanish (Latin America)') == Language('Español (Latinoamérica)') - True - - >>> s(Language('zz', strict=False).english_name) - 'Undetermined' - - >>> s(Language('pt(br)').opensubtitles) - 'pob' - """ +class GuessitConverter(babelfish.LanguageReverseConverter): _with_country_regexp = re.compile('(.*)\((.*)\)') _with_country_regexp2 = re.compile('(.*)-(.*)') - def __init__(self, language, country=None, strict=False, scheme=None): - language = u(language.strip().lower()) - with_country = (Language._with_country_regexp.match(language) or - Language._with_country_regexp2.match(language)) + def __init__(self): + self.guessit_exceptions = {} + for (alpha3, country), synlist in SYN.items(): + for syn in synlist: + self.guessit_exceptions[syn.lower()] = (alpha3, country, None) + + @property + def codes(self): + return (babelfish.language_converters['alpha3b'].codes | + babelfish.language_converters['alpha2'].codes | + babelfish.language_converters['name'].codes | + babelfish.language_converters['opensubtitles'].codes | + babelfish.country_converters['name'].codes | + frozenset(self.guessit_exceptions.keys())) + + def convert(self, alpha3, country=None, script=None): + return str(babelfish.Language(alpha3, country, script)) + + def reverse(self, name): + with_country = (GuessitConverter._with_country_regexp.match(name) or + GuessitConverter._with_country_regexp2.match(name)) + + 
name = u(name.lower()) if with_country: - self.lang = Language(with_country.group(1)).lang - self.country = Country(with_country.group(2)) - return + lang = Language.fromguessit(with_country.group(1).strip()) + lang.country = babelfish.Country.fromguessit(with_country.group(2).strip()) + return (lang.alpha3, lang.country.alpha2 if lang.country else None, lang.script or None) - self.lang = None - self.country = Country(country) if country else None + # exceptions come first, as they need to override a potential match + # with any of the other guessers + try: + return self.guessit_exceptions[name] + except KeyError: + pass - # first look for scheme specific languages - if scheme == 'opensubtitles': - if language == 'br': - self.lang = 'bre' - return - elif language == 'se': - self.lang = 'sme' - return - elif scheme is not None: - log.warning('Unrecognized scheme: "%s" - Proceeding with standard one' % scheme) - - # look for ISO language codes - if len(language) == 2: - self.lang = lng2_to_lng3.get(language) - elif len(language) == 3: - self.lang = (language - if language in lng3 - else lng3term_to_lng3.get(language)) - else: - self.lang = (lng_en_name_to_lng3.get(language) or - lng_fr_name_to_lng3.get(language)) - - # general language exceptions - if self.lang is None and language in lng_exceptions: - lang, country = lng_exceptions[language] - self.lang = Language(lang).alpha3 - self.country = Country(country) if country else None - - msg = 'The given string "%s" could not be identified as a language' % language - - if self.lang is None and strict: - raise ValueError(msg) - - if self.lang is None: - log.debug(msg) - self.lang = 'und' - - @property - def alpha2(self): - return lng3_to_lng2[self.lang] - - @property - def alpha3(self): - return self.lang - - @property - def alpha3term(self): - return lng3_to_lng3term[self.lang] - - @property - def english_name(self): - return lng3_to_lng_en_name[self.lang] - - @property - def french_name(self): - return 
lng3_to_lng_fr_name[self.lang] - - @property - def opensubtitles(self): - if self.lang == 'por' and self.country and self.country.alpha2 == 'br': - return 'pob' - elif self.lang in ['gre', 'srp']: - return self.alpha3term - return self.alpha3 - - @property - def tmdb(self): - if self.country: - return '%s-%s' % (self.alpha2, self.country.alpha2.upper()) - return self.alpha2 - - def __hash__(self): - return hash(self.lang) - - def __eq__(self, other): - if isinstance(other, Language): - return self.lang == other.lang - - if isinstance(other, base_text_type): + for conv in [babelfish.Language, + babelfish.Language.fromalpha3b, + babelfish.Language.fromalpha2, + babelfish.Language.fromname, + babelfish.Language.fromopensubtitles]: try: - return self == Language(other) - except ValueError: - return False + c = conv(name) + return c.alpha3, c.country, c.script + except (ValueError, babelfish.LanguageReverseError): + pass - return False - - def __ne__(self, other): - return not self == other - - def __nonzero__(self): - return self.lang != 'und' - - def __unicode__(self): - if self.country: - return '%s(%s)' % (self.english_name, self.country.alpha2) - else: - return self.english_name - - def __repr__(self): - if self.country: - return 'Language(%s, country=%s)' % (self.english_name, self.country) - else: - return 'Language(%s)' % self.english_name + raise babelfish.LanguageReverseError(name) -UNDETERMINED = Language('und') -ALL_LANGUAGES = frozenset(Language(lng) for lng in lng_all_names) - frozenset([UNDETERMINED]) -ALL_LANGUAGES_NAMES = lng_all_names +babelfish.language_converters['guessit'] = GuessitConverter() -def search_language(string, lang_filter=None, skip=None): +COUNTRIES_SYN = {'ES': ['españa'], + 'GB': ['UK'], + 'BR': ['brazilian', 'bra'], + # FIXME: this one is a bit of a stretch, not sure how to do + # it properly, though... 
+ 'MX': ['Latinoamérica', 'latin america'] + } + + +class GuessitCountryConverter(babelfish.CountryReverseConverter): + def __init__(self): + self.guessit_exceptions = {} + + for alpha2, synlist in COUNTRIES_SYN.items(): + for syn in synlist: + self.guessit_exceptions[syn.lower()] = alpha2 + + @property + def codes(self): + return (babelfish.country_converters['name'].codes | + frozenset(babelfish.COUNTRIES.values()) | + frozenset(self.guessit_exceptions.keys())) + + def convert(self, alpha2): + if alpha2 == 'GB': + return 'UK' + return str(Country(alpha2)) + + def reverse(self, name): + # exceptions come first, as they need to override a potential match + # with any of the other guessers + try: + return self.guessit_exceptions[name.lower()] + except KeyError: + pass + + try: + return babelfish.Country(name.upper()).alpha2 + except ValueError: + pass + + for conv in [babelfish.Country.fromname]: + try: + return conv(name).alpha2 + except babelfish.CountryReverseError: + pass + + raise babelfish.CountryReverseError(name) + + +babelfish.country_converters['guessit'] = GuessitCountryConverter() + + +# list of common words which could be interpreted as languages, but which +# are far too common to be able to say they represent a language in the +# middle of a string (where they most likely carry their commmon meaning) +LNG_COMMON_WORDS = frozenset([ + # english words + 'is', 'it', 'am', 'mad', 'men', 'man', 'run', 'sin', 'st', 'to', + 'no', 'non', 'war', 'min', 'new', 'car', 'day', 'bad', 'bat', 'fan', + 'fry', 'cop', 'zen', 'gay', 'fat', 'one', 'cherokee', 'got', 'an', 'as', + 'cat', 'her', 'be', 'hat', 'sun', 'may', 'my', 'mr', 'rum', 'pi', 'bb', 'bt', + 'tv', 'aw', 'by', 'md', 'mp', 'cd', 'lt', 'gt', 'in', 'ad', 'ice', 'ay', + # french words + 'bas', 'de', 'le', 'son', 'ne', 'ca', 'ce', 'et', 'que', + 'mal', 'est', 'vol', 'or', 'mon', 'se', 'je', 'tu', 'me', + 'ne', 'ma', 'va', 'au', + # japanese words, + 'wa', 'ga', 'ao', + # spanish words + 'la', 'el', 'del', 
'por', 'mar', + # other + 'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san', 'oss', 'iii', + 'vi', 'ben', 'da', 'lt', 'ch', + # new from babelfish + 'mkv', 'avi', 'dmd', 'the', 'dis', 'cut', 'stv', 'des', 'dia', 'and', + 'cab', 'sub', 'mia', 'rim', 'las', 'une', 'par', 'srt', 'ano', 'toy', + 'job', 'gag', 'reel', 'www', 'for', 'ayu', 'csi', 'ren', 'moi', 'sur', + 'fer', 'fun', 'two', 'big', 'psy', 'air', + # movie title + 'brazil', + # release groups + 'bs', # Bosnian + 'kz', + # countries + 'gt', 'lt', + # part/pt + 'pt' + ]) + +LNG_COMMON_WORDS_STRICT = frozenset(['brazil']) + + +subtitle_prefixes = ['sub', 'subs', 'st', 'vost', 'subforced', 'fansub', 'hardsub'] +subtitle_suffixes = ['subforced', 'fansub', 'hardsub'] +lang_prefixes = ['true'] + + +def find_possible_languages(string, allowed_languages=None): + """Find possible languages in the string + + :return: list of tuple (property, Language, lang_word, word) + """ + + common_words = None + if allowed_languages: + common_words = LNG_COMMON_WORDS_STRICT + else: + common_words = LNG_COMMON_WORDS + + words = find_words(string) + + valid_words = [] + for word in words: + lang_word = word.lower() + key = 'language' + for prefix in subtitle_prefixes: + if lang_word.startswith(prefix): + lang_word = lang_word[len(prefix):] + key = 'subtitleLanguage' + for suffix in subtitle_suffixes: + if lang_word.endswith(suffix): + lang_word = lang_word[:len(suffix)] + key = 'subtitleLanguage' + for prefix in lang_prefixes: + if lang_word.startswith(prefix): + lang_word = lang_word[len(prefix):] + if lang_word not in common_words: + try: + lang = Language.fromguessit(lang_word) + if allowed_languages: + if lang.name.lower() in allowed_languages or lang.alpha2.lower() in allowed_languages or lang.alpha3.lower() in allowed_languages: + valid_words.append((key, lang, lang_word, word)) + # Keep language with alpha2 equivalent. Others are probably + # uncommon languages. 
+ elif lang == 'mul' or hasattr(lang, 'alpha2'): + valid_words.append((key, lang, lang_word, word)) + except babelfish.Error: + pass + return valid_words + + +def search_language(string, allowed_languages=None): """Looks for language patterns, and if found return the language object, its group span and an associated confidence. you can specify a list of allowed languages using the lang_filter argument, as in lang_filter = [ 'fr', 'eng', 'spanish' ] - >>> search_language('movie [en].avi') - (Language(English), (7, 9), 0.8) + >>> search_language('movie [en].avi')['language'] + + + >>> search_language('the zen fat cat and the gay mad men got a new fan', allowed_languages = ['en', 'fr', 'es']) - >>> search_language('the zen fat cat and the gay mad men got a new fan', lang_filter = ['en', 'fr', 'es']) - (None, None, None) """ - # list of common words which could be interpreted as languages, but which - # are far too common to be able to say they represent a language in the - # middle of a string (where they most likely carry their commmon meaning) - lng_common_words = frozenset([ - # english words - 'is', 'it', 'am', 'mad', 'men', 'man', 'run', 'sin', 'st', 'to', - 'no', 'non', 'war', 'min', 'new', 'car', 'day', 'bad', 'bat', 'fan', - 'fry', 'cop', 'zen', 'gay', 'fat', 'cherokee', 'got', 'an', 'as', - 'cat', 'her', 'be', 'hat', 'sun', 'may', 'my', 'mr', 'rum', 'pi', - # french words - 'bas', 'de', 'le', 'son', 'vo', 'vf', 'ne', 'ca', 'ce', 'et', 'que', - 'mal', 'est', 'vol', 'or', 'mon', 'se', - # spanish words - 'la', 'el', 'del', 'por', 'mar', - # other - 'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san', 'oss', 'iii', - 'vi', 'ben', 'da', 'lt' - ]) - sep = r'[](){} \._-+' + if allowed_languages: + allowed_languages = set(Language.fromguessit(lang) for lang in allowed_languages) - if lang_filter: - lang_filter = lang_set(lang_filter) + confidence = 1.0 # for all of them - slow = ' %s ' % string.lower() - confidence = 1.0 # for all of them + for prop, language, 
lang, word in find_possible_languages(string, allowed_languages): + pos = string.find(word) + end = pos + len(word) - for lang in set(find_words(slow)) & lng_all_names: + # only allow those languages that have a 2-letter code, those that + # don't are too esoteric and probably false matches + # if language.lang not in lng3_to_lng2: + # continue - if lang in lng_common_words: - continue + # confidence depends on alpha2, alpha3, english name, ... + if len(lang) == 2: + confidence = 0.8 + elif len(lang) == 3: + confidence = 0.9 + elif prop == 'subtitleLanguage': + confidence = 0.6 # Subtitle prefix found with language + else: + # Note: we could either be really confident that we found a + # language or assume that full language names are too + # common words and lower their confidence accordingly + confidence = 0.3 # going with the low-confidence route here - pos = slow.find(lang) + return Guess({prop: language}, confidence=confidence, input=string, span=(pos, end)) - if pos != -1: - end = pos + len(lang) - - # skip if span in in skip list - while skip and (pos - 1, end - 1) in skip: - pos = slow.find(lang, end) - if pos == -1: - continue - end = pos + len(lang) - if pos == -1: - continue - - # make sure our word is always surrounded by separators - if slow[pos - 1] not in sep or slow[end] not in sep: - continue - - language = Language(slow[pos:end]) - if lang_filter and language not in lang_filter: - continue - - # only allow those languages that have a 2-letter code, those that - # don't are too esoteric and probably false matches - if language.lang not in lng3_to_lng2: - continue - - # confidence depends on lng2, lng3, english name, ... 
- if len(lang) == 2: - confidence = 0.8 - elif len(lang) == 3: - confidence = 0.9 - else: - # Note: we could either be really confident that we found a - # language or assume that full language names are too - # common words and lower their confidence accordingly - confidence = 0.3 # going with the low-confidence route here - - return language, (pos - 1, end - 1), confidence - - return None, None, None + return None -def guess_language(text): +def guess_language(text): # pragma: no cover """Guess the language in which a body of text is written. This uses the external guess-language python module, and will fail and return @@ -392,7 +303,7 @@ def guess_language(text): """ try: from guess_language import guessLanguage - return Language(guessLanguage(text)) + return Language.fromguessit(guessLanguage(text)) except ImportError: log.error('Cannot detect the language of the given text body, missing dependency: guess-language') diff --git a/libs/guessit/matcher.py b/libs/guessit/matcher.py index 1984c01c..2e3bc2af 100644 --- a/libs/guessit/matcher.py +++ b/libs/guessit/matcher.py @@ -2,7 +2,8 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2013 Rémi Alvergnat # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,163 +19,288 @@ # along with this program. If not, see . 
# -from __future__ import unicode_literals -from guessit import PY3, u, base_text_type -from guessit.matchtree import MatchTree -from guessit.textutils import normalize_unicode, clean_string +from __future__ import absolute_import, division, print_function, \ + unicode_literals + import logging +from guessit import PY3, u +from guessit.transfo import TransformerException +from guessit.matchtree import MatchTree +from guessit.textutils import normalize_unicode, clean_default +from guessit.guess import Guess +import inspect + log = logging.getLogger(__name__) class IterativeMatcher(object): - def __init__(self, filename, filetype='autodetect', opts=None, transfo_opts=None): - """An iterative matcher tries to match different patterns that appear - in the filename. + """An iterative matcher tries to match different patterns that appear + in the filename. - The 'filetype' argument indicates which type of file you want to match. - If it is 'autodetect', the matcher will try to see whether it can guess - that the file corresponds to an episode, or otherwise will assume it is - a movie. + The ``filetype`` argument indicates which type of file you want to match. + If it is undefined, the matcher will try to see whether it can guess + that the file corresponds to an episode, or otherwise will assume it is + a movie. - The recognized 'filetype' values are: - [ autodetect, subtitle, info, movie, moviesubtitle, movieinfo, episode, - episodesubtitle, episodeinfo ] + The recognized ``filetype`` values are: + ``['subtitle', 'info', 'movie', 'moviesubtitle', 'movieinfo', 'episode', + 'episodesubtitle', 'episodeinfo']`` + ``options`` is a dict of options values to be passed to the transformations used + by the matcher. - The IterativeMatcher works mainly in 2 steps: + The IterativeMatcher works mainly in 2 steps: - First, it splits the filename into a match_tree, which is a tree of groups - which have a semantic meaning, such as episode number, movie title, - etc... 
+ First, it splits the filename into a match_tree, which is a tree of groups + which have a semantic meaning, such as episode number, movie title, + etc... - The match_tree created looks like the following: + The match_tree created looks like the following:: - 0000000000000000000000000000000000000000000000000000000000000000000000000000000000 111 - 0000011111111111112222222222222233333333444444444444444455555555666777777778888888 000 - 0000000000000000000000000000000001111112011112222333333401123334000011233340000000 000 - __________________(The.Prestige).______.[____.HP.______.{__-___}.St{__-___}.Chaps].___ - xxxxxttttttttttttt ffffff vvvv xxxxxx ll lll xx xxx ccc - [XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv + 0000000000000000000000000000000000000000000000000000000000000000000000000000000000 111 + 0000011111111111112222222222222233333333444444444444444455555555666777777778888888 000 + 0000000000000000000000000000000001111112011112222333333401123334000011233340000000 000 + __________________(The.Prestige).______.[____.HP.______.{__-___}.St{__-___}.Chaps].___ + xxxxxttttttttttttt ffffff vvvv xxxxxx ll lll xx xxx ccc + [XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv - The first 3 lines indicates the group index in which a char in the - filename is located. So for instance, x264 is the group (0, 4, 1), and - it corresponds to a video codec, denoted by the letter'v' in the 4th line. - (for more info, see guess.matchtree.to_string) + The first 3 lines indicates the group index in which a char in the + filename is located. So for instance, ``x264`` (in the middle) is the group (0, 4, 1), and + it corresponds to a video codec, denoted by the letter ``v`` in the 4th line. + (for more info, see guess.matchtree.to_string) - Second, it tries to merge all this information into a single object - containing all the found properties, and does some (basic) conflict - resolution when they arise. 
- - - When you create the Matcher, you can pass it: - - a list 'opts' of option names, that act as global flags - - a dict 'transfo_opts' of { transfo_name: (transfo_args, transfo_kwargs) } - with which to call the transfo.process() function. - """ - - valid_filetypes = ('autodetect', 'subtitle', 'info', 'video', - 'movie', 'moviesubtitle', 'movieinfo', - 'episode', 'episodesubtitle', 'episodeinfo') - if filetype not in valid_filetypes: - raise ValueError("filetype needs to be one of %s" % valid_filetypes) + Second, it tries to merge all this information into a single object + containing all the found properties, and does some (basic) conflict + resolution when they arise. + """ + def __init__(self, filename, options=None, **kwargs): + options = dict(options or {}) + for k, v in kwargs.items(): + if k not in options or not options[k]: + options[k] = v # options dict has priority over keyword arguments + self._validate_options(options) if not PY3 and not isinstance(filename, unicode): log.warning('Given filename to matcher is not unicode...') filename = filename.decode('utf-8') filename = normalize_unicode(filename) + if options and options.get('clean_function'): + clean_function = options.get('clean_function') + if not hasattr(clean_function, '__call__'): + module, function = clean_function.rsplit('.') + if not module: + module = 'guessit.textutils' + clean_function = getattr(__import__(module), function) + if not clean_function: + log.error('Can\'t find clean function %s. Default will be used.' % options.get('clean_function')) + clean_function = clean_default + else: + clean_function = clean_default - if opts is None: - opts = [] - if not isinstance(opts, list): - raise ValueError('opts must be a list of option names! Received: type=%s val=%s', - type(opts), opts) - - if transfo_opts is None: - transfo_opts = {} - if not isinstance(transfo_opts, dict): - raise ValueError('transfo_opts must be a dict of { transfo_name: (args, kwargs) }. 
'+ - 'Received: type=%s val=%s', type(transfo_opts), transfo_opts) - - self.match_tree = MatchTree(filename) + self.match_tree = MatchTree(filename, clean_function=clean_function) + self.options = options + self._transfo_calls = [] # sanity check: make sure we don't process a (mostly) empty string - if clean_string(filename) == '': + if clean_function(filename).strip() == '': return - mtree = self.match_tree - mtree.guess.set('type', filetype, confidence=1.0) + from guessit.plugins import transformers - def apply_transfo(transfo_name, *args, **kwargs): - transfo = __import__('guessit.transfo.' + transfo_name, - globals=globals(), locals=locals(), - fromlist=['process'], level=0) - default_args, default_kwargs = transfo_opts.get(transfo_name, ((), {})) - all_args = args or default_args - all_kwargs = dict(default_kwargs) - all_kwargs.update(kwargs) # keep all kwargs merged together - transfo.process(mtree, *all_args, **all_kwargs) + try: + mtree = self.match_tree + if 'type' in self.options: + mtree.guess.set('type', self.options['type'], confidence=0.0) - # 1- first split our path into dirs + basename + ext - apply_transfo('split_path_components') + # Process + for transformer in transformers.all_transformers(): + disabled = options.get('disabled_transformers') + if not disabled or transformer.name not in disabled: + self._process(transformer, False) - # 2- guess the file type now (will be useful later) - apply_transfo('guess_filetype', filetype) - if mtree.guess['type'] == 'unknown': - return + # Post-process + for transformer in transformers.all_transformers(): + disabled = options.get('disabled_transformers') + if not disabled or transformer.name not in disabled: + self._process(transformer, True) - # 3- split each of those into explicit groups (separated by parentheses - # or square brackets) - apply_transfo('split_explicit_groups') + log.debug('Found match tree:\n%s' % u(mtree)) + except TransformerException as e: + log.debug('An error has occurred in 
Transformer %s: %s' % (e.transformer, e)) - # 4- try to match information for specific patterns - # NOTE: order needs to comply to the following: - # - website before language (eg: tvu.org.ru vs russian) - # - language before episodes_rexps - # - properties before language (eg: he-aac vs hebrew) - # - release_group before properties (eg: XviD-?? vs xvid) - if mtree.guess['type'] in ('episode', 'episodesubtitle', 'episodeinfo'): - strategy = [ 'guess_date', 'guess_website', 'guess_release_group', - 'guess_properties', 'guess_language', - 'guess_video_rexps', - 'guess_episodes_rexps', 'guess_weak_episodes_rexps' ] - else: - strategy = [ 'guess_date', 'guess_website', 'guess_release_group', - 'guess_properties', 'guess_language', - 'guess_video_rexps' ] + def _process(self, transformer, post=False): - if 'nolanguage' in opts: - strategy.remove('guess_language') + if not hasattr(transformer, 'should_process') or transformer.should_process(self.match_tree, self.options): + if post: + transformer.post_process(self.match_tree, self.options) + else: + transformer.process(self.match_tree, self.options) + self._transfo_calls.append(transformer) + @property + def second_pass_options(self): + second_pass_options = {} + for transformer in self._transfo_calls: + if hasattr(transformer, 'second_pass_options'): + transformer_second_pass_options = transformer.second_pass_options(self.match_tree, self.options) + if transformer_second_pass_options: + second_pass_options.update(transformer_second_pass_options) - for name in strategy: - apply_transfo(name) + return second_pass_options - # more guessers for both movies and episodes - apply_transfo('guess_bonus_features') - apply_transfo('guess_year', skip_first_year=('skip_first_year' in opts)) + def _validate_options(self, options): + valid_filetypes = ('subtitle', 'info', 'video', + 'movie', 'moviesubtitle', 'movieinfo', + 'episode', 'episodesubtitle', 'episodeinfo') - if 'nocountry' not in opts: - apply_transfo('guess_country') - - 
apply_transfo('guess_idnumber') - - - # split into '-' separated subgroups (with required separator chars - # around the dash) - apply_transfo('split_on_dash') - - # 5- try to identify the remaining unknown groups by looking at their - # position relative to other known elements - if mtree.guess['type'] in ('episode', 'episodesubtitle', 'episodeinfo'): - apply_transfo('guess_episode_info_from_position') - else: - apply_transfo('guess_movie_title_from_position') - - # 6- perform some post-processing steps - apply_transfo('post_process') - - log.debug('Found match tree:\n%s' % u(mtree)) + type_ = options.get('type') + if type_ and type_ not in valid_filetypes: + raise ValueError("filetype needs to be one of %s" % (valid_filetypes,)) def matched(self): return self.match_tree.matched() + + +def build_guess(node, name, value=None, confidence=1.0): + guess = Guess({name: node.clean_value if value is None else value}, confidence=confidence) + guess.metadata().input = node.value if value is None else value + if value is None: + left_offset = 0 + right_offset = 0 + + clean_value = node.clean_value + + for i in range(0, len(node.value)): + if clean_value[0] == node.value[i]: + break + left_offset += 1 + + for i in reversed(range(0, len(node.value))): + if clean_value[-1] == node.value[i]: + break + right_offset += 1 + + guess.metadata().span = (node.span[0] - node.offset + left_offset, node.span[1] - node.offset - right_offset) + return guess + + +def found_property(node, name, value=None, confidence=1.0, update_guess=True, logger=None): + # automatically retrieve the log object from the caller frame + if not logger: + caller_frame = inspect.stack()[1][0] + logger = caller_frame.f_locals['self'].log + guess = build_guess(node, name, value, confidence) + return found_guess(node, guess, update_guess=update_guess, logger=logger) + + +def found_guess(node, guess, update_guess=True, logger=None): + if node.guess: + if update_guess: + node.guess.update_highest_confidence(guess) + 
else: + child = node.add_child(guess.metadata().span) + child.guess = guess + else: + node.guess = guess + log_found_guess(guess, logger) + return node.guess + + +def log_found_guess(guess, logger=None): + for k, v in guess.items(): + (logger or log).debug('Property found: %s=%s (%s) (confidence=%.2f)' % + (k, v, guess.raw(k), guess.confidence(k))) + + +def _get_split_spans(node, span): + partition_spans = node.get_partition_spans(span) + for to_remove_span in partition_spans: + if to_remove_span[0] == span[0] and to_remove_span[1] in [span[1], span[1] + 1]: + partition_spans.remove(to_remove_span) + break + return partition_spans + + +class GuessFinder(object): + def __init__(self, guess_func, confidence=None, logger=None, options=None): + self.guess_func = guess_func + self.confidence = confidence + self.logger = logger or log + self.options = options + + def process_nodes(self, nodes): + for node in nodes: + self.process_node(node) + + def process_node(self, node, iterative=True, partial_span=None): + if partial_span: + value = node.value[partial_span[0]:partial_span[1]] + else: + value = node.value + string = ' %s ' % value # add sentinels + + if not self.options: + matcher_result = self.guess_func(string, node) + else: + matcher_result = self.guess_func(string, node, self.options) + + if matcher_result: + if not isinstance(matcher_result, Guess): + result, span = matcher_result + else: + result, span = matcher_result, matcher_result.metadata().span + + if result: + # readjust span to compensate for sentinels + span = (span[0] - 1, span[1] - 1) + + # readjust span to compensate for partial_span + if partial_span: + span = (span[0] + partial_span[0], span[1] + partial_span[0]) + + partition_spans = None + if self.options and 'skip_nodes' in self.options: + skip_nodes = self.options.get('skip_nodes') + for skip_node in skip_nodes: + if skip_node.parent.node_idx == node.node_idx[:len(skip_node.parent.node_idx)] and\ + skip_node.span == span or\ + skip_node.span == 
(span[0] + skip_node.offset, span[1] + skip_node.offset): + if partition_spans is None: + partition_spans = _get_split_spans(node, skip_node.span) + else: + new_partition_spans = [] + for partition_span in partition_spans: + tmp_node = MatchTree(value, span=partition_span, parent=node) + tmp_partitions_spans = _get_split_spans(tmp_node, skip_node.span) + new_partition_spans.extend(tmp_partitions_spans) + partition_spans.extend(new_partition_spans) + + if not partition_spans: + # restore sentinels compensation + + if isinstance(result, Guess): + guess = result + else: + guess = Guess(result, confidence=self.confidence, input=string, span=span) + + if not iterative: + found_guess(node, guess, logger=self.logger) + else: + absolute_span = (span[0] + node.offset, span[1] + node.offset) + node.partition(span) + if node.is_leaf(): + found_guess(node, guess, logger=self.logger) + else: + found_child = None + for child in node.children: + if child.span == absolute_span: + found_guess(child, guess, logger=self.logger) + found_child = child + break + for child in node.children: + if child is not found_child: + self.process_node(child) + else: + for partition_span in partition_spans: + self.process_node(node, partial_span=partition_span) diff --git a/libs/guessit/matchtree.py b/libs/guessit/matchtree.py index 0725e835..19c1e759 100644 --- a/libs/guessit/matchtree.py +++ b/libs/guessit/matchtree.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2011 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,12 +18,15 @@ # along with this program. If not, see . 
# -from __future__ import unicode_literals -from guessit import UnicodeMixin, base_text_type, Guess -from guessit.textutils import clean_string, str_fill +from __future__ import absolute_import, division, print_function, unicode_literals + +import guessit # @UnusedImport needed for doctests +from guessit import UnicodeMixin, base_text_type +from guessit.textutils import clean_default, str_fill from guessit.patterns import group_delimiters -from guessit.guess import (merge_similar_guesses, merge_all, - choose_int, choose_string) +from guessit.guess import (merge_similar_guesses, smart_merge, + choose_int, choose_string, Guess) +from itertools import takewhile import copy import logging @@ -31,23 +34,71 @@ log = logging.getLogger(__name__) class BaseMatchTree(UnicodeMixin): - """A MatchTree represents the hierarchical split of a string into its - constituent semantic groups.""" + """A BaseMatchTree is a tree covering the filename, where each + node represents a substring in the filename and can have a ``Guess`` + associated with it that contains the information that has been guessed + in this node. Nodes can be further split into subnodes until a proper + split has been found. 
- def __init__(self, string='', span=None, parent=None): + Each node has the following attributes: + - string = the original string of which this node represents a region + - span = a pair of (begin, end) indices delimiting the substring + - parent = parent node + - children = list of children nodes + - guess = Guess() + + BaseMatchTrees are displayed in the following way: + + >>> path = 'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv' + >>> print(guessit.IterativeMatcher(path).match_tree) + 000000 1111111111111111 2222222222222222222222222222222222222222222 333 + 000000 0000000000111111 0000000000111111222222222222222222222222222 000 + 011112 011112000011111222222222222222222 000 + 011112222222222222 + 0000011112222 + 01112 0111 + Movies/__________(____)/Dark.City.(____).DC._____.____.___.____-___.___ + tttttttttt yyyy yyyy fffff ssss aaa vvvv rrr ccc + Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv + + The last line contains the filename, which you can use a reference. + The previous line contains the type of property that has been found. + The line before that contains the filename, where all the found groups + have been blanked. Basically, what is left on this line are the leftover + groups which could not be identified. + + The lines before that indicate the indices of the groups in the tree. + + For instance, the part of the filename 'BDRip' is the leaf with index + ``(2, 2, 1)`` (read from top to bottom), and its meaning is 'format' + (as shown by the ``f``'s on the last-but-one line). 
+ """ + + def __init__(self, string='', span=None, parent=None, clean_function=None): self.string = string self.span = span or (0, len(string)) self.parent = parent self.children = [] self.guess = Guess() + self._clean_value = None + self._clean_function = clean_function or clean_default @property def value(self): + """Return the substring that this node matches.""" return self.string[self.span[0]:self.span[1]] @property def clean_value(self): - return clean_string(self.value) + """Return a cleaned value of the matched substring, with better + presentation formatting (punctuation marks removed, duplicate + spaces, ...)""" + if self._clean_value is None: + self._clean_value = self.clean_string(self.value) + return self._clean_value + + def clean_string(self, string): + return self._clean_function(string) @property def offset(self): @@ -55,6 +106,8 @@ class BaseMatchTree(UnicodeMixin): @property def info(self): + """Return a dict containing all the info guessed by this node, + subnodes included.""" result = dict(self.guess) for c in self.children: @@ -64,6 +117,7 @@ class BaseMatchTree(UnicodeMixin): @property def root(self): + """Return the root node of the tree.""" if not self.parent: return self @@ -71,28 +125,43 @@ class BaseMatchTree(UnicodeMixin): @property def depth(self): + """Return the depth of this node.""" if self.is_leaf(): return 0 return 1 + max(c.depth for c in self.children) def is_leaf(self): + """Return whether this node is a leaf or not.""" return self.children == [] def add_child(self, span): - child = MatchTree(self.string, span=span, parent=self) + """Add a new child node to this node with the given span.""" + child = MatchTree(self.string, span=span, parent=self, clean_function=self._clean_function) self.children.append(child) + return child - def partition(self, indices): + def get_partition_spans(self, indices): + """Return the list of absolute spans for the regions of the original + string defined by splitting this node at the given indices 
(relative + to this node)""" indices = sorted(indices) if indices[0] != 0: indices.insert(0, 0) if indices[-1] != len(self.value): indices.append(len(self.value)) + spans = [] for start, end in zip(indices[:-1], indices[1:]): - self.add_child(span=(self.offset + start, - self.offset + end)) + spans.append((self.offset + start, + self.offset + end)) + return spans + + def partition(self, indices): + """Partition this node by splitting it at the given indices, + relative to this node.""" + for partition_span in self.get_partition_spans(indices): + self.add_child(span=partition_span) def split_on_components(self, components): offset = 0 @@ -104,6 +173,7 @@ class BaseMatchTree(UnicodeMixin): offset = end def nodes_at_depth(self, depth): + """Return all the nodes at a given depth in the tree""" if depth == 0: yield self @@ -113,38 +183,109 @@ class BaseMatchTree(UnicodeMixin): @property def node_idx(self): + """Return this node's index in the tree, as a tuple. + If this node is the root of the tree, then return ().""" if self.parent is None: return () - return self.parent.node_idx + (self.parent.children.index(self),) + return self.parent.node_idx + (self.node_last_idx,) + + @property + def node_last_idx(self): + if self.parent is None: + return None + return self.parent.children.index(self) def node_at(self, idx): + """Return the node at the given index in the subtree rooted at + this node.""" if not idx: return self try: return self.children[idx[0]].node_at(idx[1:]) - except: + except IndexError: raise ValueError('Non-existent node index: %s' % (idx,)) def nodes(self): + """Return all the nodes and subnodes in this tree.""" yield self for child in self.children: for node in child.nodes(): yield node - def _leaves(self): + def leaves(self): + """Return a generator over all the nodes that are leaves.""" if self.is_leaf(): yield self else: for child in self.children: # pylint: disable=W0212 - for leaf in child._leaves(): + for leaf in child.leaves(): yield leaf - def 
leaves(self): - return list(self._leaves()) + def group_node(self): + return self._other_group_node(0) + + def previous_group_node(self): + return self._other_group_node(-1) + + def next_group_node(self): + return self._other_group_node(+1) + + def _other_group_node(self, offset): + if len(self.node_idx) > 1: + group_idx = self.node_idx[:2] + if group_idx[1] + offset >= 0: + other_group_idx = (group_idx[0], group_idx[1] + offset) + try: + other_group_node = self.root.node_at(other_group_idx) + return other_group_node + except ValueError: + pass + return None + + def previous_leaf(self, leaf): + """Return previous leaf for this node""" + return self._other_leaf(leaf, -1) + + def next_leaf(self, leaf): + """Return next leaf for this node""" + return self._other_leaf(leaf, +1) + + def _other_leaf(self, leaf, offset): + leaves = list(self.leaves()) + index = leaves.index(leaf) + offset + if index > 0 and index < len(leaves): + return leaves[index] + return None + + def previous_leaves(self, leaf): + """Return previous leaves for this node""" + leaves = list(self.leaves()) + index = leaves.index(leaf) + if index > 0 and index < len(leaves): + previous_leaves = leaves[:index] + previous_leaves.reverse() + return previous_leaves + return [] + + def next_leaves(self, leaf): + """Return next leaves for this node""" + leaves = list(self.leaves()) + index = leaves.index(leaf) + if index > 0 and index < len(leaves): + return leaves[index + 1:len(leaves)] + return [] def to_string(self): + """Return a readable string representation of this tree. + + The result is a multi-line string, where the lines are: + - line 1 -> N-2: each line contains the nodes at the given depth in the tree + - line N-2: original string where all the found groups have been blanked + - line N-1: type of property that has been found + - line N: the original string, which you can use a reference. 
+ """ empty_line = ' ' * len(self.string) def to_hex(x): @@ -153,23 +294,27 @@ class BaseMatchTree(UnicodeMixin): return x def meaning(result): - mmap = { 'episodeNumber': 'E', - 'season': 'S', - 'extension': 'e', - 'format': 'f', - 'language': 'l', - 'country': 'C', - 'videoCodec': 'v', - 'audioCodec': 'a', - 'website': 'w', - 'container': 'c', - 'series': 'T', - 'title': 't', - 'date': 'd', - 'year': 'y', - 'releaseGroup': 'r', - 'screenSize': 's' - } + mmap = {'episodeNumber': 'E', + 'season': 'S', + 'extension': 'e', + 'format': 'f', + 'language': 'l', + 'country': 'C', + 'videoCodec': 'v', + 'videoProfile': 'v', + 'audioCodec': 'a', + 'audioProfile': 'a', + 'audioChannels': 'a', + 'website': 'w', + 'container': 'c', + 'series': 'T', + 'title': 't', + 'date': 'd', + 'year': 'y', + 'releaseGroup': 'r', + 'screenSize': 's', + 'other': 'o' + } if result is None: return ' ' @@ -180,7 +325,7 @@ class BaseMatchTree(UnicodeMixin): return 'x' - lines = [ empty_line ] * (self.depth + 2) # +2: remaining, meaning + lines = [empty_line] * (self.depth + 2) # +2: remaining, meaning lines[-2] = self.string for node in self.nodes(): @@ -198,63 +343,61 @@ class BaseMatchTree(UnicodeMixin): lines.append(self.string) - return '\n'.join(lines) + return '\n'.join(l.rstrip() for l in lines) def __unicode__(self): return self.to_string() + def __repr__(self): + return '' % self.value + class MatchTree(BaseMatchTree): """The MatchTree contains a few "utility" methods which are not necessary for the BaseMatchTree, but add a lot of convenience for writing - higher-level rules.""" + higher-level rules. 
+ """ - def _unidentified_leaves(self, - valid=lambda leaf: len(leaf.clean_value) >= 2): - for leaf in self._leaves(): + def unidentified_leaves(self, + valid=lambda leaf: len(leaf.clean_value) > 0): + """Return a generator of leaves that are not empty.""" + for leaf in self.leaves(): if not leaf.guess and valid(leaf): yield leaf - def unidentified_leaves(self, - valid=lambda leaf: len(leaf.clean_value) >= 2): - return list(self._unidentified_leaves(valid)) - - def _leaves_containing(self, property_name): + def leaves_containing(self, property_name): + """Return a generator of leaves that guessed the given property.""" if isinstance(property_name, base_text_type): - property_name = [ property_name ] + property_name = [property_name] - for leaf in self._leaves(): + for leaf in self.leaves(): for prop in property_name: if prop in leaf.guess: yield leaf break - def leaves_containing(self, property_name): - return list(self._leaves_containing(property_name)) - def first_leaf_containing(self, property_name): + """Return the first leaf containing the given property.""" try: - return next(self._leaves_containing(property_name)) + return next(self.leaves_containing(property_name)) except StopIteration: return None - def _previous_unidentified_leaves(self, node): - node_idx = node.node_idx - for leaf in self._unidentified_leaves(): - if leaf.node_idx < node_idx: - yield leaf - def previous_unidentified_leaves(self, node): - return list(self._previous_unidentified_leaves(node)) - - def _previous_leaves_containing(self, node, property_name): + """Return a generator of non-empty leaves that are before the given + node (in the string).""" node_idx = node.node_idx - for leaf in self._leaves_containing(property_name): + for leaf in self.unidentified_leaves(): if leaf.node_idx < node_idx: yield leaf def previous_leaves_containing(self, node, property_name): - return list(self._previous_leaves_containing(node, property_name)) + """Return a generator of leaves containing the given 
property that are + before the given node (in the string).""" + node_idx = node.node_idx + for leaf in self.leaves_containing(property_name): + if leaf.node_idx < node_idx: + yield leaf def is_explicit(self): """Return whether the group was explicitly enclosed by @@ -262,26 +405,22 @@ class MatchTree(BaseMatchTree): return (self.value[0] + self.value[-1]) in group_delimiters def matched(self): - # we need to make a copy here, as the merge functions work in place and - # calling them on the match tree would modify it - parts = [node.guess for node in self.nodes() if node.guess] - parts = copy.deepcopy(parts) + """Return a single guess that contains all the info found in the + nodes of this tree, trying to merge properties as good as possible. + """ + if not getattr(self, '_matched_result', None): + # we need to make a copy here, as the merge functions work in place and + # calling them on the match tree would modify it + parts = [copy.copy(node.guess) for node in self.nodes() if node.guess] - # 1- try to merge similar information together and give it a higher - # confidence - for int_part in ('year', 'season', 'episodeNumber'): - merge_similar_guesses(parts, int_part, choose_int) + result = smart_merge(parts) - for string_part in ('title', 'series', 'container', 'format', - 'releaseGroup', 'website', 'audioCodec', - 'videoCodec', 'screenSize', 'episodeFormat', - 'audioChannels', 'idNumber'): - merge_similar_guesses(parts, string_part, choose_string) + log.debug('Final result: ' + result.nice_string()) + self._matched_result = result - # 2- merge the rest, potentially discarding information not properly - # merged before - result = merge_all(parts, - append=['language', 'subtitleLanguage', 'other']) + for unidentified_leaves in self.unidentified_leaves(): + if 'unidentified' not in self._matched_result: + self._matched_result['unidentified'] = [] + self._matched_result['unidentified'].append(unidentified_leaves.clean_value) - log.debug('Final result: ' + 
result.nice_string()) - return result + return self._matched_result diff --git a/libs/guessit/options.py b/libs/guessit/options.py new file mode 100644 index 00000000..9b8dc0fb --- /dev/null +++ b/libs/guessit/options.py @@ -0,0 +1,69 @@ +from argparse import ArgumentParser + + +def build_opts(transformers=None): + opts = ArgumentParser() + opts.add_argument(dest='filename', help='Filename or release name to guess', nargs='*') + + naming_opts = opts.add_argument_group("Naming") + naming_opts.add_argument('-t', '--type', dest='type', default=None, + help='The suggested file type: movie, episode. If undefined, type will be guessed.') + naming_opts.add_argument('-n', '--name-only', dest='name_only', action='store_true', default=False, + help='Parse files as name only. Disable folder parsing, extension parsing, and file content analysis.') + naming_opts.add_argument('-c', '--split-camel', dest='split_camel', action='store_true', default=False, + help='Split camel case part of filename.') + + naming_opts.add_argument('-X', '--disabled-transformer', action='append', dest='disabled_transformers', + help='Transformer to disable (can be used multiple time)') + + output_opts = opts.add_argument_group("Output") + output_opts.add_argument('-v', '--verbose', action='store_true', dest='verbose', default=False, + help='Display debug output') + output_opts.add_argument('-P', '--show-property', dest='show_property', default=None, + help='Display the value of a single property (title, series, videoCodec, year, type ...)'), + output_opts.add_argument('-u', '--unidentified', dest='unidentified', action='store_true', default=False, + help='Display the unidentified parts.'), + output_opts.add_argument('-a', '--advanced', dest='advanced', action='store_true', default=False, + help='Display advanced information for filename guesses, as json output') + output_opts.add_argument('-y', '--yaml', dest='yaml', action='store_true', default=False, + help='Display information for filename guesses 
as yaml output (like unit-test)') + output_opts.add_argument('-f', '--input-file', dest='input_file', default=False, + help='Read filenames from an input file.') + output_opts.add_argument('-d', '--demo', action='store_true', dest='demo', default=False, + help='Run a few builtin tests instead of analyzing a file') + + information_opts = opts.add_argument_group("Information") + information_opts.add_argument('-p', '--properties', dest='properties', action='store_true', default=False, + help='Display properties that can be guessed.') + information_opts.add_argument('-V', '--values', dest='values', action='store_true', default=False, + help='Display property values that can be guessed.') + information_opts.add_argument('-s', '--transformers', dest='transformers', action='store_true', default=False, + help='Display transformers that can be used.') + information_opts.add_argument('--version', dest='version', action='store_true', default=False, + help='Display the guessit version.') + + webservice_opts = opts.add_argument_group("guessit.io") + webservice_opts.add_argument('-b', '--bug', action='store_true', dest='submit_bug', default=False, + help='Submit a wrong detection to the guessit.io service') + + other_opts = opts.add_argument_group("Other features") + other_opts.add_argument('-i', '--info', dest='info', default='filename', + help='The desired information type: filename, video, hash_mpc or a hash from python\'s ' + 'hashlib module, such as hash_md5, hash_sha1, ...; or a list of any of ' + 'them, comma-separated') + + if transformers: + for transformer in transformers: + transformer.register_arguments(opts, naming_opts, output_opts, information_opts, webservice_opts, other_opts) + + return opts, naming_opts, output_opts, information_opts, webservice_opts, other_opts +_opts, _naming_opts, _output_opts, _information_opts, _webservice_opts, _other_opts = None, None, None, None, None, None + + +def reload(transformers=None): + global _opts, _naming_opts, _output_opts, 
_information_opts, _webservice_opts, _other_opts + _opts, _naming_opts, _output_opts, _information_opts, _webservice_opts, _other_opts = build_opts(transformers) + + +def get_opts(): + return _opts diff --git a/libs/guessit/patterns.py b/libs/guessit/patterns.py deleted file mode 100644 index f803a11c..00000000 --- a/libs/guessit/patterns.py +++ /dev/null @@ -1,250 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2011 Nicolas Wack -# Copyright (c) 2011 Ricard Marxer -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import unicode_literals -import re - - -subtitle_exts = [ 'srt', 'idx', 'sub', 'ssa' ] - -info_exts = [ 'nfo' ] - -video_exts = ['3g2', '3gp', '3gp2', 'asf', 'avi', 'divx', 'flv', 'm4v', 'mk2', - 'mka', 'mkv', 'mov', 'mp4', 'mp4a', 'mpeg', 'mpg', 'ogg', 'ogm', - 'ogv', 'qt', 'ra', 'ram', 'rm', 'ts', 'wav', 'webm', 'wma', 'wmv'] - -group_delimiters = [ '()', '[]', '{}' ] - -# separator character regexp -sep = r'[][,)(}{+ /\._-]' # regexp art, hehe :D - -# character used to represent a deleted char (when matching groups) -deleted = '_' - -# format: [ (regexp, confidence, span_adjust) ] -episode_rexps = [ # ... Season 2 ... - (r'season (?P[0-9]+)', 1.0, (0, 0)), - (r'saison (?P[0-9]+)', 1.0, (0, 0)), - - # ... s02e13 ... 
- (r'[Ss](?P[0-9]{1,3})[^0-9]?(?P(?:-?[eE-][0-9]{1,3})+)[^0-9]', 1.0, (0, -1)), - - # ... s03-x02 ... # FIXME: redundant? remove it? - #(r'[Ss](?P[0-9]{1,3})[^0-9]?(?P(?:-?[xX-][0-9]{1,3})+)[^0-9]', 1.0, (0, -1)), - - # ... 2x13 ... - (r'[^0-9](?P[0-9]{1,2})[^0-9 .-]?(?P(?:-?[xX][0-9]{1,3})+)[^0-9]', 1.0, (1, -1)), - - # ... s02 ... - #(sep + r's(?P[0-9]{1,2})' + sep, 0.6, (1, -1)), - (r's(?P[0-9]{1,2})[^0-9]', 0.6, (0, -1)), - - # v2 or v3 for some mangas which have multiples rips - (r'(?P[0-9]{1,3})v[23]' + sep, 0.6, (0, 0)), - - # ... ep 23 ... - ('ep' + sep + r'(?P[0-9]{1,2})[^0-9]', 0.7, (0, -1)), - - # ... e13 ... for a mini-series without a season number - (sep + r'e(?P[0-9]{1,2})' + sep, 0.6, (1, -1)) - - ] - - -weak_episode_rexps = [ # ... 213 or 0106 ... - (sep + r'(?P[0-9]{2,4})' + sep, (1, -1)) - ] - -non_episode_title = [ 'extras', 'rip' ] - - -video_rexps = [ # cd number - (r'cd ?(?P[0-9])( ?of ?(?P[0-9]))?', 1.0, (0, 0)), - (r'(?P[1-9]) cds?', 0.9, (0, 0)), - - # special editions - (r'edition' + sep + r'(?Pcollector)', 1.0, (0, 0)), - (r'(?Pcollector)' + sep + 'edition', 1.0, (0, 0)), - (r'(?Pspecial)' + sep + 'edition', 1.0, (0, 0)), - (r'(?Pcriterion)' + sep + 'edition', 1.0, (0, 0)), - - # director's cut - (r"(?Pdirector'?s?" + sep + "cut)", 1.0, (0, 0)), - - # video size - (r'(?P[0-9]{3,4})x(?P[0-9]{3,4})', 0.9, (0, 0)), - - # website - (r'(?Pwww(\.[a-zA-Z0-9]+){2,3})', 0.8, (0, 0)), - - # bonusNumber: ... x01 ... - (r'x(?P[0-9]{1,2})', 1.0, (0, 0)), - - # filmNumber: ... f01 ... - (r'f(?P[0-9]{1,2})', 1.0, (0, 0)) - ] - -websites = [ 'tvu.org.ru', 'emule-island.com', 'UsaBit.com', 'www.divx-overnet.com', - 'sharethefiles.com' ] - -unlikely_series = [ 'series' ] - - -# prop_multi is a dict of { property_name: { canonical_form: [ pattern ] } } -# pattern is a string considered as a regexp, with the addition that dashes are -# replaced with '([ \.-_])?' 
which matches more types of separators (or none) -# note: simpler patterns need to be at the end of the list to not shadow more -# complete ones, eg: 'AAC' needs to come after 'He-AAC' -# ie: from most specific to less specific -prop_multi = { 'format': { 'DVD': [ 'DVD', 'DVD-Rip', 'VIDEO-TS', 'DVDivX' ], - 'HD-DVD': [ 'HD-(?:DVD)?-Rip', 'HD-DVD' ], - 'BluRay': [ 'Blu-ray', 'B[DR]Rip' ], - 'HDTV': [ 'HD-TV' ], - 'DVB': [ 'DVB-Rip', 'DVB', 'PD-TV' ], - 'WEBRip': [ 'WEB-Rip' ], - 'Screener': [ 'DVD-SCR', 'Screener' ], - 'VHS': [ 'VHS' ], - 'WEB-DL': [ 'WEB-DL' ] }, - - 'is3D': { True: [ '3D' ] }, - - 'screenSize': { '480p': [ '480[pi]?' ], - '720p': [ '720[pi]?' ], - '1080i': [ '1080i' ], - '1080p': [ '1080p', '1080[^i]' ] }, - - 'videoCodec': { 'XviD': [ 'Xvid' ], - 'DivX': [ 'DVDivX', 'DivX' ], - 'h264': [ '[hx]-264' ], - 'Rv10': [ 'Rv10' ], - 'Mpeg2': [ 'Mpeg2' ] }, - - # has nothing to do here (or on filenames for that matter), but some - # releases use it and it helps to identify release groups, so we adapt - 'videoApi': { 'DXVA': [ 'DXVA' ] }, - - 'audioCodec': { 'AC3': [ 'AC3' ], - 'DTS': [ 'DTS' ], - 'AAC': [ 'He-AAC', 'AAC-He', 'AAC' ] }, - - 'audioChannels': { '5.1': [ r'5\.1', 'DD5[._ ]1', '5ch' ] }, - - 'episodeFormat': { 'Minisode': [ 'Minisodes?' 
] } - - } - -# prop_single dict of { property_name: [ canonical_form ] } -prop_single = { 'releaseGroup': [ 'ESiR', 'WAF', 'SEPTiC', r'\[XCT\]', 'iNT', 'PUKKA', - 'CHD', 'ViTE', 'TLF', 'FLAiTE', - 'MDX', 'GM4F', 'DVL', 'SVD', 'iLUMiNADOS', - 'aXXo', 'KLAXXON', 'NoTV', 'ZeaL', 'LOL', - 'CtrlHD', 'POD', 'WiKi','IMMERSE', 'FQM', - '2HD', 'CTU', 'HALCYON', 'EbP', 'SiTV', - 'HDBRiSe', 'AlFleNi-TeaM', 'EVOLVE', '0TV', - 'TLA', 'NTB', 'ASAP', 'MOMENTUM', 'FoV', 'D-Z0N3', - 'TrollHD', 'ECI' - ], - - # potentially confusing release group names (they are words) - 'weakReleaseGroup': [ 'DEiTY', 'FiNaLe', 'UnSeeN', 'KiNGS', 'CLUE', 'DIMENSION', - 'SAiNTS', 'ARROW', 'EuReKA', 'SiNNERS', 'DiRTY', 'REWARD', - 'REPTiLE', - ], - - 'other': [ 'PROPER', 'REPACK', 'LIMITED', 'DualAudio', 'Audiofixed', 'R5', - 'complete', 'classic', # not so sure about these ones, could appear in a title - 'ws' ] # widescreen - } - -_dash = '-' -_psep = '[-. _]?' - -def _to_rexp(prop): - return re.compile(prop.replace(_dash, _psep), re.IGNORECASE) - -# properties_rexps dict of { property_name: { canonical_form: [ rexp ] } } -# containing the rexps compiled from both prop_multi and prop_single -properties_rexps = dict((type, dict((canonical_form, - [ _to_rexp(pattern) for pattern in patterns ]) - for canonical_form, patterns in props.items())) - for type, props in prop_multi.items()) - -properties_rexps.update(dict((type, dict((canonical_form, [ _to_rexp(canonical_form) ]) - for canonical_form in props)) - for type, props in prop_single.items())) - - - -def find_properties(string): - result = [] - for property_name, props in properties_rexps.items(): - # FIXME: this should be done in a more flexible way... 
- if property_name in ['weakReleaseGroup']: - continue - - for canonical_form, rexps in props.items(): - for value_rexp in rexps: - match = value_rexp.search(string) - if match: - start, end = match.span() - # make sure our word is always surrounded by separators - # note: sep is a regexp, but in this case using it as - # a char sequence achieves the same goal - if ((start > 0 and string[start-1] not in sep) or - (end < len(string) and string[end] not in sep)): - continue - - result.append((property_name, canonical_form, start, end)) - return result - - -property_synonyms = { 'Special Edition': [ 'Special' ], - 'Collector Edition': [ 'Collector' ], - 'Criterion Edition': [ 'Criterion' ] - } - - -def revert_synonyms(): - reverse = {} - - for canonical, synonyms in property_synonyms.items(): - for synonym in synonyms: - reverse[synonym.lower()] = canonical - - return reverse - - -reverse_synonyms = revert_synonyms() - - -def canonical_form(string): - return reverse_synonyms.get(string.lower(), string) - - -def compute_canonical_form(property_name, value): - """Return the canonical form of a property given its type if it is a valid - one, None otherwise.""" - if isinstance(value, basestring): - for canonical_form, rexps in properties_rexps[property_name].items(): - for rexp in rexps: - if rexp.match(value): - return canonical_form - return None diff --git a/libs/guessit/patterns/__init__.py b/libs/guessit/patterns/__init__.py new file mode 100755 index 00000000..1816d494 --- /dev/null +++ b/libs/guessit/patterns/__init__.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2013 Rémi Alvergnat +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later 
version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . +# + +from __future__ import absolute_import, division, print_function, unicode_literals + +import re + +from guessit import base_text_type + +group_delimiters = ['()', '[]', '{}'] + +# separator character regexp +sep = r'[][,)(}:{+ /~/\._-]' # regexp art, hehe :D + +_dash = '-' +_psep = '[\W_]?' + + +def build_or_pattern(patterns, escape=False): + """Build a or pattern string from a list of possible patterns + """ + or_pattern = [] + for pattern in patterns: + if not or_pattern: + or_pattern.append('(?:') + else: + or_pattern.append('|') + or_pattern.append('(?:%s)' % re.escape(pattern) if escape else pattern) + or_pattern.append(')') + return ''.join(or_pattern) + + +def compile_pattern(pattern, enhance=True): + """Compile and enhance a pattern + + :param pattern: Pattern to compile (regexp). + :type pattern: string + + :param pattern: Enhance pattern before compiling. + :type pattern: string + + :return: The compiled pattern + :rtype: regular expression object + """ + return re.compile(enhance_pattern(pattern) if enhance else pattern, re.IGNORECASE) + + +def enhance_pattern(pattern): + """Enhance pattern to match more equivalent values. + + '-' are replaced by '[\W_]?', which matches more types of separators (or none) + + :param pattern: Pattern to enhance (regexp). 
+ :type pattern: string + + :return: The enhanced pattern + :rtype: string + """ + return pattern.replace(_dash, _psep) diff --git a/libs/guessit/patterns/extension.py b/libs/guessit/patterns/extension.py new file mode 100644 index 00000000..40a576b6 --- /dev/null +++ b/libs/guessit/patterns/extension.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2013 Rémi Alvergnat +# Copyright (c) 2011 Ricard Marxer +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . 
+# + +from __future__ import absolute_import, division, print_function, unicode_literals + +subtitle_exts = ['srt', 'idx', 'sub', 'ssa', 'ass'] + +info_exts = ['nfo'] + +video_exts = ['3g2', '3gp', '3gp2', 'asf', 'avi', 'divx', 'flv', 'm4v', 'mk2', + 'mka', 'mkv', 'mov', 'mp4', 'mp4a', 'mpeg', 'mpg', 'ogg', 'ogm', + 'ogv', 'qt', 'ra', 'ram', 'rm', 'ts', 'wav', 'webm', 'wma', 'wmv', + 'iso'] diff --git a/libs/guessit/patterns/numeral.py b/libs/guessit/patterns/numeral.py new file mode 100644 index 00000000..f254c6b8 --- /dev/null +++ b/libs/guessit/patterns/numeral.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Rémi Alvergnat +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . 
+# + +from __future__ import absolute_import, division, print_function, unicode_literals + +import re + +digital_numeral = '\d{1,4}' + +roman_numeral = "(?=[MCDLXVI]+)M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})" + +english_word_numeral_list = [ + 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', + 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty' +] + +french_word_numeral_list = [ + 'zéro', 'un', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix', + 'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dix-sept', 'dix-huit', 'dix-neuf', 'vingt' +] + +french_alt_word_numeral_list = [ + 'zero', 'une', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix', + 'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dixsept', 'dixhuit', 'dixneuf', 'vingt' +] + + +def __build_word_numeral(*args, **kwargs): + re_ = None + for word_list in args: + for word in word_list: + if not re_: + re_ = '(?:(?=\w+)' + else: + re_ += '|' + re_ += word + re_ += ')' + return re_ + +word_numeral = __build_word_numeral(english_word_numeral_list, french_word_numeral_list, french_alt_word_numeral_list) + +numeral = '(?:' + digital_numeral + '|' + roman_numeral + '|' + word_numeral + ')' + +__romanNumeralMap = ( + ('M', 1000), + ('CM', 900), + ('D', 500), + ('CD', 400), + ('C', 100), + ('XC', 90), + ('L', 50), + ('XL', 40), + ('X', 10), + ('IX', 9), + ('V', 5), + ('IV', 4), + ('I', 1) + ) + +__romanNumeralPattern = re.compile('^' + roman_numeral + '$') + + +def __parse_roman(value): + """convert Roman numeral to integer""" + if not __romanNumeralPattern.search(value): + raise ValueError('Invalid Roman numeral: %s' % value) + + result = 0 + index = 0 + for num, integer in __romanNumeralMap: + while value[index:index + len(num)] == num: + result += integer + index += len(num) + return result + + +def __parse_word(value): + """Convert 
Word numeral to integer""" + for word_list in [english_word_numeral_list, french_word_numeral_list, french_alt_word_numeral_list]: + try: + return word_list.index(value.lower()) + except ValueError: + pass + raise ValueError + + +_clean_re = re.compile('[^\d]*(\d+)[^\d]*') + + +def parse_numeral(value, int_enabled=True, roman_enabled=True, word_enabled=True, clean=True): + """Parse a numeric value into integer. + + input can be an integer as a string, a roman numeral or a word + + :param value: Value to parse. Can be an integer, roman numeral or word. + :type value: string + + :return: Numeric value, or None if value can't be parsed + :rtype: int + """ + if int_enabled: + try: + if clean: + match = _clean_re.match(value) + if match: + clean_value = match.group(1) + return int(clean_value) + return int(value) + except ValueError: + pass + if roman_enabled: + try: + if clean: + for word in value.split(): + try: + return __parse_roman(word.upper()) + except ValueError: + pass + return __parse_roman(value) + except ValueError: + pass + if word_enabled: + try: + if clean: + for word in value.split(): + try: + return __parse_word(word) + except ValueError: + pass + return __parse_word(value) + except ValueError: + pass + raise ValueError('Invalid numeral: ' + value) diff --git a/libs/guessit/plugins/__init__.py b/libs/guessit/plugins/__init__.py new file mode 100644 index 00000000..6a63e4e1 --- /dev/null +++ b/libs/guessit/plugins/__init__.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. 
+# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . +# + +from __future__ import absolute_import, division, print_function, unicode_literals diff --git a/libs/guessit/plugins/transformers.py b/libs/guessit/plugins/transformers.py new file mode 100644 index 00000000..f2f746c0 --- /dev/null +++ b/libs/guessit/plugins/transformers.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . 
+# + +from __future__ import absolute_import, division, print_function, unicode_literals +from guessit.options import reload as reload_options + +from stevedore import ExtensionManager +from pkg_resources import EntryPoint + +from stevedore.extension import Extension +from logging import getLogger + +log = getLogger(__name__) + + +class Transformer(object): # pragma: no cover + def __init__(self, priority=0): + self.priority = priority + self.log = getLogger(self.name) + + @property + def name(self): + return self.__class__.__name__ + + def supported_properties(self): + return {} + + def second_pass_options(self, mtree, options=None): + return None + + def should_process(self, mtree, options=None): + return True + + def process(self, mtree, options=None): + pass + + def post_process(self, mtree, options=None): + pass + + def register_arguments(self, opts, naming_opts, output_opts, information_opts, webservice_opts, other_options): + pass + + def rate_quality(self, guess, *props): + return 0 + + +class CustomTransformerExtensionManager(ExtensionManager): + def __init__(self, namespace='guessit.transformer', invoke_on_load=True, + invoke_args=(), invoke_kwds={}, propagate_map_exceptions=True, on_load_failure_callback=None, + verify_requirements=False): + super(CustomTransformerExtensionManager, self).__init__(namespace=namespace, + invoke_on_load=invoke_on_load, + invoke_args=invoke_args, + invoke_kwds=invoke_kwds, + propagate_map_exceptions=propagate_map_exceptions, + on_load_failure_callback=on_load_failure_callback, + verify_requirements=verify_requirements) + + def order_extensions(self, extensions): + """Order the loaded transformers + + It should follow those rules + - website before language (eg: tvu.org.ru vs russian) + - language before episodes_rexps + - properties before language (eg: he-aac vs hebrew) + - release_group before properties (eg: XviD-?? 
vs xvid) + """ + extensions.sort(key=lambda ext: -ext.obj.priority) + return extensions + + def _load_one_plugin(self, ep, invoke_on_load, invoke_args, invoke_kwds, verify_requirements=True): + if not ep.dist: + # `require` argument of ep.load() is deprecated in newer versions of setuptools + if hasattr(ep, 'resolve'): + plugin = ep.resolve() + elif hasattr(ep, '_load'): + plugin = ep._load() + else: + plugin = ep.load(require=False) + else: + plugin = ep.load() + if invoke_on_load: + obj = plugin(*invoke_args, **invoke_kwds) + else: + obj = None + return Extension(ep.name, ep, plugin, obj) + + def _load_plugins(self, invoke_on_load, invoke_args, invoke_kwds, verify_requirements): + return self.order_extensions(super(CustomTransformerExtensionManager, self)._load_plugins(invoke_on_load, invoke_args, invoke_kwds, verify_requirements)) + + def objects(self): + return self.map(self._get_obj) + + def _get_obj(self, ext): + return ext.obj + + def object(self, name): + try: + return self[name].obj + except KeyError: + return None + + def register_module(self, name=None, module_name=None, attrs=(), entry_point=None): + if entry_point: + ep = EntryPoint.parse(entry_point) + else: + ep = EntryPoint(name, module_name, attrs) + loaded = self._load_one_plugin(ep, invoke_on_load=True, invoke_args=(), invoke_kwds={}) + if loaded: + self.extensions.append(loaded) + self.extensions = self.order_extensions(self.extensions) + self._extensions_by_name = None + + +class DefaultTransformerExtensionManager(CustomTransformerExtensionManager): + @property + def _internal_entry_points(self): + return ['split_path_components = guessit.transfo.split_path_components:SplitPathComponents', + 'guess_filetype = guessit.transfo.guess_filetype:GuessFiletype', + 'split_explicit_groups = guessit.transfo.split_explicit_groups:SplitExplicitGroups', + 'guess_date = guessit.transfo.guess_date:GuessDate', + 'guess_website = guessit.transfo.guess_website:GuessWebsite', + 'guess_release_group = 
guessit.transfo.guess_release_group:GuessReleaseGroup', + 'guess_properties = guessit.transfo.guess_properties:GuessProperties', + 'guess_language = guessit.transfo.guess_language:GuessLanguage', + 'guess_video_rexps = guessit.transfo.guess_video_rexps:GuessVideoRexps', + 'guess_episodes_rexps = guessit.transfo.guess_episodes_rexps:GuessEpisodesRexps', + 'guess_weak_episodes_rexps = guessit.transfo.guess_weak_episodes_rexps:GuessWeakEpisodesRexps', + 'guess_bonus_features = guessit.transfo.guess_bonus_features:GuessBonusFeatures', + 'guess_year = guessit.transfo.guess_year:GuessYear', + 'guess_country = guessit.transfo.guess_country:GuessCountry', + 'guess_idnumber = guessit.transfo.guess_idnumber:GuessIdnumber', + 'split_on_dash = guessit.transfo.split_on_dash:SplitOnDash', + 'guess_episode_info_from_position = guessit.transfo.guess_episode_info_from_position:GuessEpisodeInfoFromPosition', + 'guess_movie_title_from_position = guessit.transfo.guess_movie_title_from_position:GuessMovieTitleFromPosition', + 'guess_episode_details = guessit.transfo.guess_episode_details:GuessEpisodeDetails', + 'expected_series = guessit.transfo.expected_series:ExpectedSeries', + 'expected_title = guessit.transfo.expected_title:ExpectedTitle',] + + def _find_entry_points(self, namespace): + entry_points = {} + # Internal entry points + if namespace == self.namespace: + for internal_entry_point_str in self._internal_entry_points: + internal_entry_point = EntryPoint.parse(internal_entry_point_str) + entry_points[internal_entry_point.name] = internal_entry_point + + # Package entry points + setuptools_entrypoints = super(DefaultTransformerExtensionManager, self)._find_entry_points(namespace) + for setuptools_entrypoint in setuptools_entrypoints: + entry_points[setuptools_entrypoint.name] = setuptools_entrypoint + + return list(entry_points.values()) + +_extensions = None + + +def all_transformers(): + return _extensions.objects() + + +def get_transformer(name): + return 
_extensions.object(name) + + +def add_transformer(name, module_name, class_name): + """ + Add a transformer + + :param name: the name of the transformer. ie: 'guess_regexp_id' + :param name: the module name. ie: 'flexget.utils.parsers.transformers.guess_regexp_id' + :param class_name: the class name. ie: 'GuessRegexpId' + """ + + _extensions.register_module(name, module_name, (class_name,)) + + +def add_transformer(entry_point): + """ + Add a transformer + + :param entry_point: entry point spec format. ie: 'guess_regexp_id = flexget.utils.parsers.transformers.guess_regexp_id:GuessRegexpId' + """ + _extensions.register_module(entry_point = entry_point) + + +def reload(custom=False): + """ + Reload extension manager with default or custom one. + :param custom: if True, custom manager will be used, else default one. + Default manager will load default extensions from guessit and setuptools packaging extensions + Custom manager will not load default extensions from guessit, using only setuptools packaging extensions. + :type custom: boolean + """ + global _extensions + if custom: + _extensions = CustomTransformerExtensionManager() + else: + _extensions = DefaultTransformerExtensionManager() + reload_options(all_transformers()) + +reload() diff --git a/libs/guessit/quality.py b/libs/guessit/quality.py new file mode 100644 index 00000000..870bbdbb --- /dev/null +++ b/libs/guessit/quality.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Rémi Alvergnat +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. 
+# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . +# + +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.plugins.transformers import all_transformers + + +def best_quality_properties(props, *guesses): + """Retrieve the best quality guess, based on given properties + + :param props: Properties to include in the rating + :type props: list of strings + :param guesses: Guesses to rate + :type guesses: :class:`guessit.guess.Guess` + + :return: Best quality guess from all passed guesses + :rtype: :class:`guessit.guess.Guess` + """ + best_guess = None + best_rate = None + for guess in guesses: + for transformer in all_transformers(): + rate = transformer.rate_quality(guess, *props) + if best_rate is None or best_rate < rate: + best_rate = rate + best_guess = guess + return best_guess + + +def best_quality(*guesses): + """Retrieve the best quality guess. 
+ + :param guesses: Guesses to rate + :type guesses: :class:`guessit.guess.Guess` + + :return: Best quality guess from all passed guesses + :rtype: :class:`guessit.guess.Guess` + """ + best_guess = None + best_rate = None + for guess in guesses: + for transformer in all_transformers(): + rate = transformer.rate_quality(guess) + if best_rate is None or best_rate < rate: + best_rate = rate + best_guess = guess + return best_guess diff --git a/libs/guessit/slogging.py b/libs/guessit/slogging.py index 39591a20..00fb80f7 100644 --- a/libs/guessit/slogging.py +++ b/libs/guessit/slogging.py @@ -1,28 +1,28 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Smewt - A smart collection manager -# Copyright (c) 2011 Nicolas Wack +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack # -# Smewt is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # -# Smewt is distributed in the hope that it will be useful, +# GuessIt is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. +# Lesser GNU General Public License for more details. # -# You should have received a copy of the GNU General Public License +# You should have received a copy of the Lesser GNU General Public License # along with this program. If not, see . 
# -from __future__ import unicode_literals +from __future__ import absolute_import, division, print_function, unicode_literals + import logging import sys -import os, os.path - +import os GREEN_FONT = "\x1B[0;32m" YELLOW_FONT = "\x1B[0;33m" @@ -31,7 +31,7 @@ RED_FONT = "\x1B[0;31m" RESET_FONT = "\x1B[0m" -def setupLogging(colored=True, with_time=False, with_thread=False, filename=None, with_lineno=False): +def setup_logging(colored=True, with_time=False, with_thread=False, filename=None, with_lineno=False): # pragma: no cover """Set up a nice colored logger as the main application logger.""" class SimpleFormatter(logging.Formatter): diff --git a/libs/guessit/test/1MB b/libs/guessit/test/1MB new file mode 100644 index 00000000..66d50a84 Binary files /dev/null and b/libs/guessit/test/1MB differ diff --git a/libs/guessit/test/__init__.py b/libs/guessit/test/__init__.py new file mode 100644 index 00000000..7ce54945 --- /dev/null +++ b/libs/guessit/test/__init__.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . 
+# + +from __future__ import absolute_import, division, print_function, unicode_literals + +import logging +from guessit.slogging import setup_logging +setup_logging() +logging.disable(logging.INFO) diff --git a/libs/guessit/test/__main__.py b/libs/guessit/test/__main__.py new file mode 100644 index 00000000..32b8dd10 --- /dev/null +++ b/libs/guessit/test/__main__.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . 
+# + +from __future__ import absolute_import, division, print_function, unicode_literals +from guessit.test import (test_api, test_autodetect, test_autodetect_all, test_doctests, + test_episode, test_hashes, test_language, test_main, + test_matchtree, test_movie, test_quality, test_utils) +from unittest import TextTestRunner + + +import logging + +def main(): + for suite in [test_api.suite, test_autodetect.suite, + test_autodetect_all.suite, test_doctests.suite, + test_episode.suite, test_hashes.suite, test_language.suite, + test_main.suite, test_matchtree.suite, test_movie.suite, + test_quality.suite, test_utils.suite]: + TextTestRunner(verbosity=2).run(suite) + + +if __name__ == '__main__': + main() diff --git a/libs/guessit/test/autodetect.yaml b/libs/guessit/test/autodetect.yaml new file mode 100644 index 00000000..864b8827 --- /dev/null +++ b/libs/guessit/test/autodetect.yaml @@ -0,0 +1,489 @@ +? Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv +: type: movie + title: Fear and Loathing in Las Vegas + year: 1998 + screenSize: 720p + format: HD-DVD + audioCodec: DTS + videoCodec: h264 + releaseGroup: ESiR + +? Leopard.dmg +: type: unknown + extension: dmg + +? Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi +: type: episode + series: Duckman + season: 1 + episodeNumber: 1 + title: I, Duckman + date: 2002-11-07 + +? Series/Neverwhere/Neverwhere.05.Down.Street.[tvu.org.ru].avi +: type: episode + series: Neverwhere + episodeNumber: 5 + title: Down Street + website: tvu.org.ru + +? Neverwhere.05.Down.Street.[tvu.org.ru].avi +: type: episode + series: Neverwhere + episodeNumber: 5 + title: Down Street + website: tvu.org.ru + +? Series/Breaking Bad/Minisodes/Breaking.Bad.(Minisodes).01.Good.Cop.Bad.Cop.WEBRip.XviD.avi +: type: episode + series: Breaking Bad + episodeFormat: Minisode + episodeNumber: 1 + title: Good Cop Bad Cop + format: WEBRip + videoCodec: XviD + +? 
Series/Kaamelott/Kaamelott - Livre V - Ep 23 - Le Forfait.avi +: type: episode + series: Kaamelott + episodeNumber: 23 + title: Le Forfait + +? Movies/The Doors (1991)/09.03.08.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv +: type: movie + title: The Doors + year: 1991 + date: 2008-03-09 + format: BluRay + screenSize: 720p + audioCodec: AC3 + videoCodec: h264 + releaseGroup: HiS@SiLUHD + language: english + website: sharethefiles.com + +? Movies/M.A.S.H. (1970)/MASH.(1970).[Divx.5.02][Dual-Subtitulos][DVDRip].ogm +: type: movie + title: M.A.S.H. + year: 1970 + videoCodec: DivX + format: DVD + +? the.mentalist.501.hdtv-lol.mp4 +: type: episode + series: The Mentalist + season: 5 + episodeNumber: 1 + format: HDTV + releaseGroup: LOL + +? the.simpsons.2401.hdtv-lol.mp4 +: type: episode + series: The Simpsons + season: 24 + episodeNumber: 1 + format: HDTV + releaseGroup: LOL + +? Homeland.S02E01.HDTV.x264-EVOLVE.mp4 +: type: episode + series: Homeland + season: 2 + episodeNumber: 1 + format: HDTV + videoCodec: h264 + releaseGroup: EVOLVE + +? /media/Band_of_Brothers-e01-Currahee.mkv +: type: episode + series: Band of Brothers + episodeNumber: 1 + title: Currahee + +? /media/Band_of_Brothers-x02-We_Stand_Alone_Together.mkv +: type: episode + series: Band of Brothers + bonusNumber: 2 + bonusTitle: We Stand Alone Together + +? /movies/James_Bond-f21-Casino_Royale-x02-Stunts.mkv +: type: movie + title: Casino Royale + filmSeries: James Bond + filmNumber: 21 + bonusNumber: 2 + bonusTitle: Stunts + +? /TV Shows/new.girl.117.hdtv-lol.mp4 +: type: episode + series: New Girl + season: 1 + episodeNumber: 17 + format: HDTV + releaseGroup: LOL + +? The.Office.(US).1x03.Health.Care.HDTV.XviD-LOL.avi +: type: episode + series: The Office (US) + country: US + season: 1 + episodeNumber: 3 + title: Health Care + format: HDTV + videoCodec: XviD + releaseGroup: LOL + +? 
The_Insider-(1999)-x02-60_Minutes_Interview-1996.mp4 +: type: movie + title: The Insider + year: 1999 + bonusNumber: 2 + bonusTitle: 60 Minutes Interview-1996 + +? OSS_117--Cairo,_Nest_of_Spies.mkv +: type: movie + title: OSS 117--Cairo, Nest of Spies + +? Rush.._Beyond_The_Lighted_Stage-x09-Between_Sun_and_Moon-2002_Hartford.mkv +: type: movie + title: Rush Beyond The Lighted Stage + bonusNumber: 9 + bonusTitle: Between Sun and Moon-2002 Hartford + +? House.Hunters.International.S56E06.720p.hdtv.x264.mp4 +: type: episode + series: House Hunters International + season: 56 + episodeNumber: 6 + screenSize: 720p + format: HDTV + videoCodec: h264 + +? White.House.Down.2013.1080p.BluRay.DTS-HD.MA.5.1.x264-PublicHD.mkv +: type: movie + title: White House Down + year: 2013 + screenSize: 1080p + format: BluRay + audioCodec: DTS + audioProfile: HDMA + videoCodec: h264 + releaseGroup: PublicHD + audioChannels: "5.1" + +? White.House.Down.2013.1080p.BluRay.DTSHD.MA.5.1.x264-PublicHD.mkv +: type: movie + title: White House Down + year: 2013 + screenSize: 1080p + format: BluRay + audioCodec: DTS + audioProfile: HDMA + videoCodec: h264 + releaseGroup: PublicHD + audioChannels: "5.1" + +? Hostages.S01E01.Pilot.for.Air.720p.WEB-DL.DD5.1.H.264-NTb.nfo +: type: episodeinfo + series: Hostages + title: Pilot for Air + season: 1 + episodeNumber: 1 + screenSize: 720p + format: WEB-DL + audioChannels: "5.1" + videoCodec: h264 + audioCodec: DolbyDigital + releaseGroup: NTb + +? Despicable.Me.2.2013.1080p.BluRay.x264-VeDeTT.nfo +: type: movieinfo + title: Despicable Me 2 + year: 2013 + screenSize: 1080p + format: BluRay + videoCodec: h264 + releaseGroup: VeDeTT + +? Le Cinquieme Commando 1971 SUBFORCED FRENCH DVDRiP XViD AC3 Bandix.mkv +: type: movie + audioCodec: AC3 + format: DVD + releaseGroup: Bandix + subtitleLanguage: French + title: Le Cinquieme Commando + videoCodec: XviD + year: 1971 + +? 
Le Seigneur des Anneaux - La Communauté de l'Anneau - Version Longue - BDRip.mkv +: type: movie + format: BluRay + title: Le Seigneur des Anneaux + +? La petite bande (Michel Deville - 1983) VF PAL MP4 x264 AAC.mkv +: type: movie + audioCodec: AAC + language: French + title: La petite bande + videoCodec: h264 + year: 1983 + +? Retour de Flammes (Gregor Schnitzler 2003) FULL DVD.iso +: type: movie + format: DVD + title: Retour de Flammes + type: movie + year: 2003 + +? A.Common.Title.Special.2014.avi +: type: movie + year: 2014 + title: A Common Title Special + +? A.Common.Title.2014.Special.avi +: type: episode + year: 2014 + series: A Common Title + title: Special + episodeDetails: Special + +? A.Common.Title.2014.Special.Edition.avi +: type: movie + year: 2014 + title: A Common Title + edition: Special Edition + +? Downton.Abbey.2013.Christmas.Special.HDTV.x264-FoV.mp4 +: type: episode + year: 2013 + series: Downton Abbey + title: Christmas Special + videoCodec: h264 + releaseGroup: FoV + format: HDTV + episodeDetails: Special + +? Doctor_Who_2013_Christmas_Special.The_Time_of_The_Doctor.HD +: options: -n + type: episode + series: Doctor Who + other: HD + episodeDetails: Special + title: Christmas Special The Time of The Doctor + year: 2013 + +? Doctor Who 2005 50th Anniversary Special The Day of the Doctor 3.avi +: type: episode + series: Doctor Who + episodeDetails: Special + title: 50th Anniversary Special The Day of the Doctor 3 + year: 2005 + +? Robot Chicken S06-Born Again Virgin Christmas Special HDTV x264.avi +: type: episode + series: Robot Chicken + format: HDTV + season: 6 + title: Born Again Virgin Christmas Special + videoCodec: h264 + episodeDetails: Special + +? Wicked.Tuna.S03E00.Head.To.Tail.Special.HDTV.x264-YesTV +: options: -n + type: episode + series: Wicked Tuna + title: Head To Tail Special + releaseGroup: YesTV + season: 3 + episodeNumber: 0 + videoCodec: h264 + format: HDTV + episodeDetails: Special + +? 
The.Voice.UK.S03E12.HDTV.x264-C4TV +: options: -n + episodeNumber: 12 + videoCodec: h264 + format: HDTV + series: The Voice (UK) + releaseGroup: C4TV + season: 3 + country: United Kingdom + type: episode + +? /tmp/star.trek.9/star.trek.9.mkv +: type: movie + title: star trek 9 + +? star.trek.9.mkv +: type: movie + title: star trek 9 + +? FlexGet.S01E02.TheName.HDTV.xvid +: options: -n + episodeNumber: 2 + format: HDTV + season: 1 + series: FlexGet + title: TheName + type: episode + videoCodec: XviD + +? FlexGet.S01E02.TheName.HDTV.xvid +: options: -n + episodeNumber: 2 + format: HDTV + season: 1 + series: FlexGet + title: TheName + type: episode + videoCodec: XviD + +? some.series.S03E14.Title.Here.720p +: options: -n + episodeNumber: 14 + screenSize: 720p + season: 3 + series: some series + title: Title Here + type: episode + +? '[the.group] Some.Series.S03E15.Title.Two.720p' +: options: -n + episodeNumber: 15 + releaseGroup: the.group + screenSize: 720p + season: 3 + series: Some Series + title: Title Two + type: episode + +? 'HD 720p: Some series.S03E16.Title.Three' +: options: -n + episodeNumber: 16 + other: HD + screenSize: 720p + season: 3 + series: Some series + title: Title Three + type: episode + +? Something.Season.2.1of4.Ep.Title.HDTV.torrent +: episodeCount: 4 + episodeNumber: 1 + format: HDTV + season: 2 + series: Something + title: Title + type: episode + +? Show-A (US) - Episode Title S02E09 hdtv +: options: -n + country: US + episodeNumber: 9 + format: HDTV + season: 2 + series: Show-A (US) + type: episode + +? Jack's.Show.S03E01.blah.1080p +: options: -n + episodeNumber: 1 + screenSize: 1080p + season: 3 + series: Jack's Show + title: blah + type: episode + +? FlexGet.epic +: options: -n + title: FlexGet epic + type: movie + +? FlexGet.Apt.1 +: options: -n + title: FlexGet Apt 1 + type: movie + +? FlexGet.aptitude +: options: -n + title: FlexGet aptitude + type: movie + +? FlexGet.Step1 +: options: -n + title: FlexGet Step1 + type: movie + +? 
Movies/El Bosque Animado (1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720 * 432].avi +: format: DVD + screenSize: 720x432 + title: El Bosque Animado + videoCodec: XviD + year: 1987 + type: movie + +? Movies/El Bosque Animado (1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720x432].avi +: format: DVD + screenSize: 720x432 + title: El Bosque Animado + videoCodec: XviD + year: 1987 + type: movie + +? 2009.shoot.fruit.chan.multi.dvd9.pal +: options: -n + format: DVD + language: mul + other: PAL + title: shoot fruit chan + type: movie + year: 2009 + +? 2009.shoot.fruit.chan.multi.dvd5.pal +: options: -n + format: DVD + language: mul + other: PAL + title: shoot fruit chan + type: movie + year: 2009 + +? The.Flash.2014.S01E01.PREAIR.WEBRip.XviD-EVO.avi +: episodeNumber: 1 + format: WEBRip + other: Preair + releaseGroup: EVO + season: 1 + series: The Flash + type: episode + videoCodec: XviD + year: 2014 + +? Ice.Lake.Rebels.S01E06.Ice.Lake.Games.720p.HDTV.x264-DHD +: options: -n + episodeNumber: 6 + format: HDTV + releaseGroup: DHD + screenSize: 720p + season: 1 + series: Ice Lake Rebels + title: Ice Lake Games + type: episode + videoCodec: h264 + +? The League - S06E10 - Epi Sexy.mkv +: episodeNumber: 10 + season: 6 + series: The League + title: Epi Sexy + type: episode + +? Stay (2005) [1080p]/Stay.2005.1080p.BluRay.x264.YIFY.mp4 +: format: BluRay + releaseGroup: YIFY + screenSize: 1080p + title: Stay + type: movie + videoCodec: h264 + year: 2005 \ No newline at end of file diff --git a/libs/guessit/test/dummy.srt b/libs/guessit/test/dummy.srt new file mode 100644 index 00000000..ca4cf8b8 --- /dev/null +++ b/libs/guessit/test/dummy.srt @@ -0,0 +1 @@ +Just a dummy srt file (used for unittests: do not remove!) diff --git a/libs/guessit/test/episodes.yaml b/libs/guessit/test/episodes.yaml new file mode 100644 index 00000000..afba6e74 --- /dev/null +++ b/libs/guessit/test/episodes.yaml @@ -0,0 +1,1174 @@ +# Dubious tests +# +#? 
"finale " +#: releaseGroup: FiNaLe +# extension: "" + + +? Series/Californication/Season 2/Californication.2x05.Vaginatown.HDTV.XviD-0TV.avi +: series: Californication + season: 2 + episodeNumber: 5 + title: Vaginatown + format: HDTV + videoCodec: XviD + releaseGroup: 0TV + +? Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi +: series: Dexter + season: 5 + episodeNumber: 2 + title: Hello, Bandit + language: English + subtitleLanguage: French + format: HDTV + videoCodec: XviD + releaseGroup: AlFleNi-TeaM + website: tvu.org.ru + +? Series/Treme/Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.avi +: series: Treme + season: 1 + episodeNumber: 3 + title: Right Place, Wrong Time + format: HDTV + videoCodec: XviD + releaseGroup: NoTV + +? Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi +: series: Duckman + season: 1 + episodeNumber: 1 + title: I, Duckman + date: 2002-11-07 + +? Series/Duckman/Duckman - S1E13 Joking The Chicken (unedited).avi +: series: Duckman + season: 1 + episodeNumber: 13 + title: Joking The Chicken + +? Series/Simpsons/Saison 12 Français/Simpsons,.The.12x08.A.Bas.Le.Sergent.Skinner.FR.avi +: series: The Simpsons + season: 12 + episodeNumber: 8 + title: A Bas Le Sergent Skinner + language: French + +? Series/Futurama/Season 3 (mkv)/[™] Futurama - S03E22 - Le chef de fer à 30% ( 30 Percent Iron Chef ).mkv +: series: Futurama + season: 3 + episodeNumber: 22 + title: Le chef de fer à 30% + +? Series/The Office/Season 6/The Office - S06xE01.avi +: series: The Office + season: 6 + episodeNumber: 1 + +? series/The Office/Season 4/The Office [401] Fun Run.avi +: series: The Office + season: 4 + episodeNumber: 1 + title: Fun Run + +? Series/Mad Men Season 1 Complete/Mad.Men.S01E01.avi +: series: Mad Men + season: 1 + episodeNumber: 1 + other: complete + +? 
series/Psych/Psych S02 Season 2 Complete English DVD/Psych.S02E02.65.Million.Years.Off.avi +: series: Psych + season: 2 + episodeNumber: 2 + title: 65 Million Years Off + language: english + format: DVD + other: complete + +? series/Psych/Psych S02 Season 2 Complete English DVD/Psych.S02E03.Psy.Vs.Psy.Français.srt +: series: Psych + season: 2 + episodeNumber: 3 + title: Psy Vs Psy + format: DVD + language: English + subtitleLanguage: French + other: complete + +? Series/Pure Laine/Pure.Laine.1x01.Toutes.Couleurs.Unies.FR.(Québec).DVB-Kceb.[tvu.org.ru].avi +: series: Pure Laine + season: 1 + episodeNumber: 1 + title: Toutes Couleurs Unies + format: DVB + releaseGroup: Kceb + language: french + website: tvu.org.ru + +? Series/Pure Laine/2x05 - Pure Laine - Je Me Souviens.avi +: series: Pure Laine + season: 2 + episodeNumber: 5 + title: Je Me Souviens + +? Series/Tout sur moi/Tout sur moi - S02E02 - Ménage à trois (14-01-2008) [Rip by Ampli].avi +: series: Tout sur moi + season: 2 + episodeNumber: 2 + title: Ménage à trois + date: 2008-01-14 + +? The.Mentalist.2x21.18-5-4.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi +: series: The Mentalist + season: 2 + episodeNumber: 21 + title: 18-5-4 + language: english + subtitleLanguage: french + format: HDTV + videoCodec: Xvid + releaseGroup: AlFleNi-TeaM + website: tvu.org.ru + +? series/__ Incomplete __/Dr Slump (Catalan)/Dr._Slump_-_003_DVB-Rip_Catalan_by_kelf.avi +: series: Dr Slump + episodeNumber: 3 + format: DVB + language: catalan + +? series/Ren and Stimpy - Black_hole_[DivX].avi +: series: Ren and Stimpy + title: Black hole + videoCodec: DivX + +? Series/Walt Disney/Donald.Duck.-.Good.Scouts.[www.bigernie.jump.to].avi +: series: Donald Duck + title: Good Scouts + website: www.bigernie.jump.to + +? Series/Neverwhere/Neverwhere.05.Down.Street.[tvu.org.ru].avi +: series: Neverwhere + episodeNumber: 5 + title: Down Street + website: tvu.org.ru + +? 
Series/South Park/Season 4/South.Park.4x07.Cherokee.Hair.Tampons.DVDRip.[tvu.org.ru].avi +: series: South Park + season: 4 + episodeNumber: 7 + title: Cherokee Hair Tampons + format: DVD + website: tvu.org.ru + +? Series/Kaamelott/Kaamelott - Livre V - Ep 23 - Le Forfait.avi +: series: Kaamelott + episodeNumber: 23 + title: Le Forfait + +? Series/Duckman/Duckman - 110 (10) - 20021218 - Cellar Beware.avi +: series: Duckman + season: 1 + episodeNumber: 10 + date: 2002-12-18 + title: Cellar Beware + +? Series/Ren & Stimpy/Ren And Stimpy - Onward & Upward-Adult Party Cartoon.avi +: series: Ren And Stimpy + title: Onward & Upward-Adult Party Cartoon + +? Series/Breaking Bad/Minisodes/Breaking.Bad.(Minisodes).01.Good.Cop.Bad.Cop.WEBRip.XviD.avi +: series: Breaking Bad + episodeFormat: Minisode + episodeNumber: 1 + title: Good Cop Bad Cop + format: WEBRip + videoCodec: XviD + +? Series/My Name Is Earl/My.Name.Is.Earl.S01Extras.-.Bad.Karma.DVDRip.XviD.avi +: series: My Name Is Earl + season: 1 + title: Bad Karma + format: DVD + episodeDetails: Extras + videoCodec: XviD + +? series/Freaks And Geeks/Season 1/Episode 4 - Kim Kelly Is My Friend-eng(1).srt +: series: Freaks And Geeks + season: 1 + episodeNumber: 4 + title: Kim Kelly Is My Friend + language: English + +? /mnt/series/The Big Bang Theory/S01/The.Big.Bang.Theory.S01E01.mkv +: series: The Big Bang Theory + season: 1 + episodeNumber: 1 + +? /media/Parks_and_Recreation-s03-e01.mkv +: series: Parks and Recreation + season: 3 + episodeNumber: 1 + +? /media/Parks_and_Recreation-s03-e02-Flu_Season.mkv +: series: Parks and Recreation + season: 3 + title: Flu Season + episodeNumber: 2 + +? /media/Parks_and_Recreation-s03-x01.mkv +: series: Parks and Recreation + season: 3 + bonusNumber: 1 + +? /media/Parks_and_Recreation-s03-x02-Gag_Reel.mkv +: series: Parks and Recreation + season: 3 + bonusNumber: 2 + bonusTitle: Gag Reel + +? 
/media/Band_of_Brothers-e01-Currahee.mkv +: series: Band of Brothers + episodeNumber: 1 + title: Currahee + +? /media/Band_of_Brothers-x02-We_Stand_Alone_Together.mkv +: series: Band of Brothers + bonusNumber: 2 + bonusTitle: We Stand Alone Together + +? /TV Shows/Mad.M-5x9.mkv +: series: Mad M + season: 5 + episodeNumber: 9 + +? /TV Shows/new.girl.117.hdtv-lol.mp4 +: series: New Girl + season: 1 + episodeNumber: 17 + format: HDTV + releaseGroup: LOL + +? Kaamelott - 5x44x45x46x47x48x49x50.avi +: series: Kaamelott + season: 5 + episodeNumber: 44 + episodeList: [44, 45, 46, 47, 48, 49, 50] + +? Example S01E01-02.avi +: series: Example + season: 1 + episodeNumber: 1 + episodeList: [1, 2] + +? Example S01E01E02.avi +: series: Example + season: 1 + episodeNumber: 1 + episodeList: [1, 2] + +? Series/Baccano!/Baccano!_-_T1_-_Trailer_-_[Ayu](dae8173e).mkv +: series: Baccano! + other: Trailer + releaseGroup: Ayu + title: T1 + crc32: dae8173e + +? Series/Doctor Who (2005)/Season 06/Doctor Who (2005) - S06E01 - The Impossible Astronaut (1).avi +: series: Doctor Who + year: 2005 + season: 6 + episodeNumber: 1 + title: The Impossible Astronaut + +? Parks and Recreation - [04x12] - Ad Campaign.avi +: series: Parks and Recreation + season: 4 + episodeNumber: 12 + title: Ad Campaign + +? The Sopranos - [05x07] - In Camelot.mp4 +: series: The Sopranos + season: 5 + episodeNumber: 7 + title: In Camelot + +? The.Office.(US).1x03.Health.Care.HDTV.XviD-LOL.avi +: series: The Office (US) + country: US + season: 1 + episodeNumber: 3 + title: Health Care + format: HDTV + videoCodec: XviD + releaseGroup: LOL + +? /Volumes/data-1/Series/Futurama/Season 3/Futurama_-_S03_DVD_Bonus_-_Deleted_Scenes_Part_3.ogm +: series: Futurama + season: 3 + part: 3 + other: Bonus + title: Deleted Scenes + format: DVD + +? Ben.and.Kate.S01E02.720p.HDTV.X264-DIMENSION.mkv +: series: Ben and Kate + season: 1 + episodeNumber: 2 + screenSize: 720p + format: HDTV + videoCodec: h264 + releaseGroup: DIMENSION + +? 
/volume1/TV Series/Drawn Together/Season 1/Drawn Together 1x04 Requiem for a Reality Show.avi +: series: Drawn Together + season: 1 + episodeNumber: 4 + title: Requiem for a Reality Show + +? Sons.of.Anarchy.S05E06.720p.WEB.DL.DD5.1.H.264-CtrlHD.mkv +: series: Sons of Anarchy + season: 5 + episodeNumber: 6 + screenSize: 720p + format: WEB-DL + audioChannels: "5.1" + audioCodec: DolbyDigital + videoCodec: h264 + releaseGroup: CtrlHD + +? /media/bdc64bfe-e36f-4af8-b550-e6fd2dfaa507/TV_Shows/Doctor Who (2005)/Saison 6/Doctor Who (2005) - S06E13 - The Wedding of River Song.mkv +: series: Doctor Who + season: 6 + episodeNumber: 13 + year: 2005 + title: The Wedding of River Song + idNumber: bdc64bfe-e36f-4af8-b550-e6fd2dfaa507 + +? /mnt/videos/tvshows/Doctor Who/Season 06/E13 - The Wedding of River Song.mkv +: series: Doctor Who + season: 6 + episodeNumber: 13 + title: The Wedding of River Song + +? The.Simpsons.S24E03.Adventures.in.Baby-Getting.720p.WEB-DL.DD5.1.H.264-CtrlHD.mkv +: series: The Simpsons + season: 24 + episodeNumber: 3 + title: Adventures in Baby-Getting + screenSize: 720p + format: WEB-DL + audioChannels: "5.1" + audioCodec: DolbyDigital + videoCodec: h264 + releaseGroup: CtrlHD + +? /home/disaster/Videos/TV/Merlin/merlin_2008.5x02.arthurs_bane_part_two.repack.720p_hdtv_x264-fov.mkv +: series: Merlin + season: 5 + episodeNumber: 2 + part: 2 + title: Arthurs bane + screenSize: 720p + format: HDTV + videoCodec: h264 + releaseGroup: Fov + year: 2008 + other: Proper + +? "Da Vinci's Demons - 1x04 - The Magician.mkv" +: series: "Da Vinci's Demons" + season: 1 + episodeNumber: 4 + title: The Magician + +? CSI.S013E18.Sheltered.720p.WEB-DL.DD5.1.H.264.mkv +: series: CSI + season: 13 + episodeNumber: 18 + title: Sheltered + screenSize: 720p + format: WEB-DL + audioChannels: "5.1" + audioCodec: DolbyDigital + videoCodec: h264 + +? 
Game of Thrones S03E06 1080i HDTV DD5.1 MPEG2-TrollHD.ts +: series: Game of Thrones + season: 3 + episodeNumber: 6 + screenSize: 1080i + format: HDTV + audioChannels: "5.1" + audioCodec: DolbyDigital + videoCodec: MPEG2 + releaseGroup: TrollHD + +? gossip.girl.s01e18.hdtv.xvid-2hd.eng.srt +: series: gossip girl + season: 1 + episodeNumber: 18 + format: HDTV + videoCodec: XviD + releaseGroup: 2HD + subtitleLanguage: english + +? Wheels.S03E01E02.720p.HDTV.x264-IMMERSE.mkv +: series: Wheels + season: 3 + episodeNumber: 1 + episodeList: [1, 2] + screenSize: 720p + format: HDTV + videoCodec: h264 + releaseGroup: IMMERSE + +? Wheels.S03E01-02.720p.HDTV.x264-IMMERSE.mkv +: series: Wheels + season: 3 + episodeNumber: 1 + episodeList: [1, 2] + screenSize: 720p + format: HDTV + videoCodec: h264 + releaseGroup: IMMERSE + +? Wheels.S03E01-E02.720p.HDTV.x264-IMMERSE.mkv +: series: Wheels + season: 3 + episodeNumber: 1 + episodeList: [1, 2] + screenSize: 720p + format: HDTV + videoCodec: h264 + releaseGroup: IMMERSE + +? Wheels.S03E01-03.720p.HDTV.x264-IMMERSE.mkv +: series: Wheels + season: 3 + episodeNumber: 1 + episodeList: [1, 2, 3] + screenSize: 720p + format: HDTV + videoCodec: h264 + releaseGroup: IMMERSE + +? Marvels.Agents.of.S.H.I.E.L.D.S01E06.720p.HDTV.X264-DIMENSION.mkv +: series: Marvels Agents of S.H.I.E.L.D. + season: 1 + episodeNumber: 6 + screenSize: 720p + format: HDTV + videoCodec: h264 + releaseGroup: DIMENSION + +? Marvels.Agents.of.S.H.I.E.L.D..S01E06.720p.HDTV.X264-DIMENSION.mkv +: series: Marvels Agents of S.H.I.E.L.D. + season: 1 + episodeNumber: 6 + screenSize: 720p + format: HDTV + videoCodec: h264 + releaseGroup: DIMENSION + +? Series/Friday Night Lights/Season 1/Friday Night Lights S01E19 - Ch-Ch-Ch-Ch-Changes.avi +: series: Friday Night Lights + season: 1 + episodeNumber: 19 + title: Ch-Ch-Ch-Ch-Changes + +? 
Dexter Saison VII FRENCH.BDRip.XviD-MiND.nfo +: series: Dexter + season: 7 + videoCodec: XviD + language: French + format: BluRay + releaseGroup: MiND + +? Dexter Saison sept FRENCH.BDRip.XviD-MiND.nfo +: series: Dexter + season: 7 + videoCodec: XviD + language: French + format: BluRay + releaseGroup: MiND + +? "Pokémon S16 - E29 - 1280*720 HDTV VF.mkv" +: series: Pokémon + format: HDTV + language: French + season: 16 + episodeNumber: 29 + screenSize: 720p + +? One.Piece.E576.VOSTFR.720p.HDTV.x264-MARINE-FORD.mkv +: episodeNumber: 576 + videoCodec: h264 + format: HDTV + series: One Piece + releaseGroup: MARINE-FORD + subtitleLanguage: French + screenSize: 720p + +? Dexter.S08E12.FINAL.MULTi.1080p.BluRay.x264-MiND.mkv +: videoCodec: h264 + episodeNumber: 12 + season: 8 + format: BluRay + series: Dexter + other: final + language: Multiple languages + releaseGroup: MiND + screenSize: 1080p + +? One Piece - E623 VOSTFR HD [www.manga-ddl-free.com].mkv +: website: www.manga-ddl-free.com + episodeNumber: 623 + subtitleLanguage: French + series: One Piece + other: HD + +? Falling Skies Saison 1.HDLight.720p.x264.VFF.mkv +: language: French + screenSize: 720p + season: 1 + series: Falling Skies + videoCodec: h264 + other: HDLight + +? Sleepy.Hollow.S01E09.720p.WEB-DL.DD5.1.H.264-BP.mkv +: episodeNumber: 9 + videoCodec: h264 + format: WEB-DL + series: Sleepy Hollow + audioChannels: "5.1" + screenSize: 720p + season: 1 + videoProfile: BP + audioCodec: DolbyDigital + +? Sleepy.Hollow.S01E09.720p.WEB-DL.DD5.1.H.264-BS.mkv +: episodeNumber: 9 + videoCodec: h264 + format: WEB-DL + series: Sleepy Hollow + audioChannels: "5.1" + screenSize: 720p + season: 1 + releaseGroup: BS + audioCodec: DolbyDigital + +? Battlestar.Galactica.S00.Pilot.FRENCH.DVDRip.XviD-NOTAG.avi +: series: Battlestar Galactica + season: 0 + title: Pilot + episodeDetails: Pilot + language: French + format: DVD + videoCodec: XviD + releaseGroup: NOTAG + +? 
The Big Bang Theory S00E00 Unaired Pilot VOSTFR TVRip XviD-VioCs +: options: -n + series: The Big Bang Theory + season: 0 + episodeNumber: 0 + subtitleLanguage: French + format: TV + videoCodec: XviD + releaseGroup: VioCs + episodeDetails: [Unaired, Pilot] + title: Unaired Pilot + +? The Big Bang Theory S01E00 PROPER Unaired Pilot TVRip XviD-GIGGITY +: options: -n + series: The Big Bang Theory + season: 1 + episodeNumber: 0 + format: TV + videoCodec: XviD + releaseGroup: GIGGITY + other: proper + episodeDetails: [Unaired, Pilot] + title: Unaired Pilot + +? Pawn.Stars.S2014E18.720p.HDTV.x264-KILLERS +: options: -n + series: Pawn Stars + season: 2014 + year: 2014 + episodeNumber: 18 + screenSize: 720p + format: HDTV + videoCodec: h264 + releaseGroup: KILLERS + +? 2.Broke.Girls.S03E10.480p.HDTV.x264-mSD.mkv +: series: 2 Broke Girls + season: 3 + episodeNumber: 10 + screenSize: 480p + format: HDTV + videoCodec: h264 + releaseGroup: mSD + +? House.of.Cards.2013.S02E03.1080p.NF.WEBRip.DD5.1.x264-NTb.mkv +: series: House of Cards + year: 2013 + season: 2 + episodeNumber: 3 + screenSize: 1080p + other: Netflix + format: Webrip + audioChannels: "5.1" + audioCodec: DolbyDigital + videoCodec: h264 + releaseGroup: NTb + +? the.100.109.hdtv-lol.mp4 +: series: the 100 + season: 1 + episodeNumber: 9 + format: HDTV + releaseGroup: lol + +? 03-Criminal.Minds.5x03.Reckoner.ENG.-.sub.FR.HDTV.XviD-STi.[tvu.org.ru].avi +: series: Criminal Minds + language: English + subtitleLanguage: French + season: 5 + episodeNumber: 3 + videoCodec: XviD + format: HDTV + website: tvu.org.ru + releaseGroup: STi + title: Reckoner + +? 03-Criminal.Minds.avi +: series: Criminal Minds + episodeNumber: 3 + +? '[Evil-Saizen]_Laughing_Salesman_14_[DVD][1C98686A].mkv' +: crc32: 1C98686A + episodeNumber: 14 + format: DVD + releaseGroup: Evil-Saizen + series: Laughing Salesman + +? 
'[Kaylith] Zankyou no Terror - 04 [480p][B4D4514E].mp4' +: crc32: B4D4514E + episodeNumber: 4 + releaseGroup: Kaylith + screenSize: 480p + series: Zankyou no Terror + +? '[PuyaSubs!] Seirei Tsukai no Blade Dance - 05 [720p][32DD560E].mkv' +: crc32: 32DD560E + episodeNumber: 5 + releaseGroup: PuyaSubs! + screenSize: 720p + series: Seirei Tsukai no Blade Dance + +? '[Doremi].Happiness.Charge.Precure.27.[1280x720].[DC91581A].mkv' +: crc32: DC91581A + episodeNumber: 27 + releaseGroup: Doremi + screenSize: 720p + series: Happiness Charge Precure + +? "[Daisei] Free!:Iwatobi Swim Club - 01 ~ (BD 720p 10-bit AAC) [99E8E009].mkv" +: audioCodec: AAC + crc32: 99E8E009 + episodeNumber: 1 + format: BluRay + releaseGroup: Daisei + screenSize: 720p + series: Free!:Iwatobi Swim Club + videoProfile: 10bit + +? '[Tsundere] Boku wa Tomodachi ga Sukunai - 03 [BDRip h264 1920x1080 10bit FLAC][AF0C22CC].mkv' +: audioCodec: Flac + crc32: AF0C22CC + episodeNumber: 3 + format: BluRay + releaseGroup: Tsundere + screenSize: 1080p + series: Boku wa Tomodachi ga Sukunai + videoCodec: h264 + videoProfile: 10bit + +? '[t.3.3.d]_Mikakunin_de_Shinkoukei_-_12_[720p][5DDC1352].mkv' +: crc32: 5DDC1352 + episodeNumber: 12 + screenSize: 720p + series: Mikakunin de Shinkoukei + releaseGroup: t.3.3.d + +? '[Anime-Koi] Sabagebu! - 06 [h264-720p][ABB3728A].mkv' +: crc32: ABB3728A + episodeNumber: 6 + releaseGroup: Anime-Koi + screenSize: 720p + series: Sabagebu! + videoCodec: h264 + +? '[aprm-Diogo4D] [BD][1080p] Nagi no Asukara 08 [4D102B7C].mkv' +: crc32: 4D102B7C + episodeNumber: 8 + format: BluRay + releaseGroup: aprm-Diogo4D + screenSize: 1080p + series: Nagi no Asukara + +? '[Akindo-SSK] Zankyou no Terror - 05 [720P][Sub_ITA][F5CCE87C].mkv' +: crc32: F5CCE87C + episodeNumber: 5 + releaseGroup: Akindo-SSK + screenSize: 720p + series: Zankyou no Terror + subtitleLanguage: it + +? Naruto Shippuden Episode 366 VOSTFR.avi +: episodeNumber: 366 + series: Naruto Shippuden + subtitleLanguage: fr + +? 
Naruto Shippuden Episode 366v2 VOSTFR.avi +: episodeNumber: 366 + version: 2 + series: Naruto Shippuden + subtitleLanguage: fr + +? '[HorribleSubs] Ao Haru Ride - 06 [480p].mkv' +: episodeNumber: 6 + releaseGroup: HorribleSubs + screenSize: 480p + series: Ao Haru Ride + +? '[DeadFish] Tari Tari - 01 [BD][720p][AAC].mp4' +: audioCodec: AAC + episodeNumber: 1 + format: BluRay + releaseGroup: DeadFish + screenSize: 720p + series: Tari Tari + +? '[NoobSubs] Sword Art Online II 06 (720p 8bit AAC).mp4' +: audioCodec: AAC + episodeNumber: 6 + releaseGroup: NoobSubs + screenSize: 720p + series: Sword Art Online II + videoProfile: 8bit + +? '[DeadFish] 01 - Tari Tari [BD][720p][AAC].mp4' +: audioCodec: AAC + episodeNumber: 1 + format: BluRay + releaseGroup: DeadFish + screenSize: 720p + series: Tari Tari + +? '[NoobSubs] 06 Sword Art Online II (720p 8bit AAC).mp4' +: audioCodec: AAC + episodeNumber: 6 + releaseGroup: NoobSubs + screenSize: 720p + series: Sword Art Online II + videoProfile: 8bit + +? '[DeadFish] 12 - Tari Tari [BD][720p][AAC].mp4' +: audioCodec: AAC + episodeNumber: 12 + format: BluRay + releaseGroup: DeadFish + screenSize: 720p + series: Tari Tari + +? Something.Season.2.1of4.Ep.Title.HDTV.torrent +: episodeCount: 4 + episodeNumber: 1 + format: HDTV + season: 2 + series: Something + title: Title + extension: torrent + +? Something.Season.2of5.3of9.Ep.Title.HDTV.torrent +: episodeCount: 9 + episodeNumber: 3 + format: HDTV + season: 2 + seasonCount: 5 + series: Something + title: Title + extension: torrent + +? Something.Other.Season.3of5.Complete.HDTV.torrent +: format: HDTV + other: Complete + season: 3 + seasonCount: 5 + series: Something Other + extension: torrent + +? Something.Other.Season.1-3.avi +: season: 1 + seasonList: + - 1 + - 2 + - 3 + series: Something Other + +? Something.Other.Season.1&3.avi +: season: 1 + seasonList: + - 1 + - 3 + series: Something Other + +? 
Something.Other.Season.1&3-1to12ep.avi +: season: 1 + seasonList: + - 1 + - 3 + series: Something Other + +? Something.Other.saison 1 2 & 4 a 7.avi +: season: 1 + seasonList: + - 1 + - 2 + - 4 + - 5 + - 6 + - 7 + series: Something Other + +? W2Test.123.HDTV.XViD-FlexGet +: options: -n + episodeNumber: 23 + season: 1 + format: HDTV + releaseGroup: FlexGet + series: W2Test + videoCodec: XviD + +? W2Test.123.HDTV.XViD-FlexGet +: options: -n --episode-prefer-number + episodeNumber: 123 + format: HDTV + releaseGroup: FlexGet + series: W2Test + videoCodec: XviD + +? FooBar.0307.PDTV-FlexGet +: options: -n --episode-prefer-number + episodeNumber: 7 + format: DVB + releaseGroup: FlexGet + season: 3 + series: FooBar + +? FooBar.307.PDTV-FlexGet +: options: -n --episode-prefer-number + episodeNumber: 307 + format: DVB + releaseGroup: FlexGet + series: FooBar + +? FooBar.07.PDTV-FlexGet +: options: -n --episode-prefer-number + episodeNumber: 7 + format: DVB + releaseGroup: FlexGet + series: FooBar + +? FooBar.7.PDTV-FlexGet +: options: -n -t episode --episode-prefer-number + episodeNumber: 7 + format: DVB + releaseGroup: FlexGet + series: FooBar + +? FooBar.0307.PDTV-FlexGet +: options: -n + episodeNumber: 7 + format: DVB + releaseGroup: FlexGet + season: 3 + series: FooBar + +? FooBar.307.PDTV-FlexGet +: options: -n + episodeNumber: 7 + format: DVB + releaseGroup: FlexGet + season: 3 + series: FooBar + +? FooBar.07.PDTV-FlexGet +: options: -n + episodeNumber: 7 + format: DVB + releaseGroup: FlexGet + series: FooBar + +? FooBar.07v4.PDTV-FlexGet +: options: -n + episodeNumber: 7 + version: 4 + format: DVB + releaseGroup: FlexGet + series: FooBar + +? FooBar.7.PDTV-FlexGet +: options: -n -t episode + format: DVB + releaseGroup: FlexGet + series: FooBar 7 + +? FooBar.7v3.PDTV-FlexGet +: options: -n -t episode + episodeNumber: 7 + version: 3 + format: DVB + releaseGroup: FlexGet + series: FooBar + +? 
Test.S02E01.hdtv.real.proper +: options: -n + episodeNumber: 1 + format: HDTV + other: Proper + properCount: 2 + season: 2 + series: Test + +? Real.Test.S02E01.hdtv.proper +: options: -n + episodeNumber: 1 + format: HDTV + other: Proper + properCount: 1 + season: 2 + series: Real Test + +? Test.Real.S02E01.hdtv.proper +: options: -n + episodeNumber: 1 + format: HDTV + other: Proper + properCount: 1 + season: 2 + series: Test Real + +? Test.S02E01.hdtv.proper +: options: -n + episodeNumber: 1 + format: HDTV + other: Proper + properCount: 1 + season: 2 + series: Test + +? Test.S02E01.hdtv.real.repack.proper +: options: -n + episodeNumber: 1 + format: HDTV + other: Proper + properCount: 3 + season: 2 + series: Test + +? Date.Show.03-29-2012.HDTV.XViD-FlexGet +: options: -n + date: 2012-03-29 + format: HDTV + releaseGroup: FlexGet + series: Date Show + videoCodec: XviD + +? Something.1x5.Season.Complete-FlexGet +: options: -n + episodeNumber: 5 + other: Complete + season: 1 + series: Something + releaseGroup: FlexGet + +? Something Seasons 1 & 2 - Complete +: options: -n + other: Complete + season: 1 + seasonList: + - 1 + - 2 + series: Something + +? Something Seasons 4 Complete +: options: -n + other: Complete + season: 4 + series: Something + +? Something.1xAll.Season.Complete-FlexGet +: options: -n + other: Complete + season: 1 + series: Something + releaseGroup: FlexGet + +? Something.1xAll-FlexGet +: options: -n + other: Complete + season: 1 + series: Something + releaseGroup: FlexGet + +? FlexGet.US.S2013E14.Title.Here.720p.HDTV.AAC5.1.x264-NOGRP +: options: -n + audioChannels: '5.1' + audioCodec: AAC + country: US + episodeNumber: 14 + format: HDTV + releaseGroup: NOGRP + screenSize: 720p + season: 2013 + series: FlexGet (US) + title: Title Here + videoCodec: h264 + year: 2013 + +? 
FlexGet.14.of.21.Title.Here.720p.HDTV.AAC5.1.x264-NOGRP +: options: -n + audioChannels: '5.1' + audioCodec: AAC + episodeCount: 21 + episodeNumber: 14 + format: HDTV + releaseGroup: NOGRP + screenSize: 720p + series: FlexGet + title: Title Here + videoCodec: h264 + +? FlexGet.Series.2013.14.of.21.Title.Here.720p.HDTV.AAC5.1.x264-NOGRP +: options: -n + audioChannels: '5.1' + audioCodec: AAC + episodeCount: 21 + episodeNumber: 14 + format: HDTV + releaseGroup: NOGRP + screenSize: 720p + season: 2013 + series: FlexGet + title: Title Here + videoCodec: h264 + year: 2013 + +? Something.S04E05E09 +: options: -n + episodeList: + - 5 + - 6 + - 7 + - 8 + - 9 + episodeNumber: 5 + season: 4 + series: Something + +? FooBar 360 1080i +: options: -n -t episode --episode-prefer-number + episodeNumber: 360 + screenSize: 1080i + series: FooBar + +? FooBar 360 1080i +: options: -n -t episode + episodeNumber: 60 + season: 3 + screenSize: 1080i + series: FooBar + +? FooBar 360 +: options: -n -t episode + screenSize: 360p + series: FooBar + +? BarFood christmas special HDTV +: options: -n -t episode --expected-series BarFood + format: HDTV + series: BarFood + title: christmas special + episodeDetails: Special + +? Something.2008x12.13-FlexGet +: options: -n -t episode + series: Something + date: 2008-12-13 + title: FlexGet + +? '[Ignored] Test 12' +: options: -n + episodeNumber: 12 + releaseGroup: Ignored + series: Test + +? '[FlexGet] Test 12' +: options: -n + episodeNumber: 12 + releaseGroup: FlexGet + series: Test + +? Test.13.HDTV-Ignored +: options: -n + episodeNumber: 13 + format: HDTV + releaseGroup: Ignored + series: Test + +? Test.13.HDTV-Ignored +: options: -n --expected-series test + episodeNumber: 13 + format: HDTV + releaseGroup: Ignored + series: Test + +? Test.13.HDTV-Ignored +: series: Test + episodeNumber: 13 + format: HDTV + releaseGroup: Ignored + +? 
Test.13.HDTV-Ignored +: options: -n --expected-group "Name;FlexGet" + episodeNumber: 13 + format: HDTV + releaseGroup: Ignored + series: Test + +? Test.13.HDTV-FlexGet +: options: -n + episodeNumber: 13 + format: HDTV + releaseGroup: FlexGet + series: Test + +? Test.14.HDTV-Name +: options: -n + episodeNumber: 14 + format: HDTV + releaseGroup: Name + series: Test + +? Real.Time.With.Bill.Maher.2014.10.31.HDTV.XviD-AFG.avi +: date: 2014-10-31 + format: HDTV + releaseGroup: AFG + series: Real Time With Bill Maher + videoCodec: XviD diff --git a/libs/guessit/test/guessittest.py b/libs/guessit/test/guessittest.py new file mode 100644 index 00000000..1e9374f0 --- /dev/null +++ b/libs/guessit/test/guessittest.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . 
+# + +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit import base_text_type, u +from collections import defaultdict +from unittest import TestCase, TestLoader, TextTestRunner +import shlex +import babelfish +import yaml, logging, sys, os +from os.path import * + + +def currentPath(): + '''Returns the path in which the calling file is located.''' + return dirname(join(os.getcwd(), sys._getframe(1).f_globals['__file__'])) + + +def addImportPath(path): + '''Function that adds the specified path to the import path. The path can be + absolute or relative to the calling file.''' + importPath = abspath(join(currentPath(), path)) + sys.path = [importPath] + sys.path + +log = logging.getLogger(__name__) + +from guessit.plugins import transformers +from guessit.options import get_opts +import guessit +from guessit import * +from guessit.matcher import * +from guessit.fileutils import * + + +def allTests(testClass): + return TestLoader().loadTestsFromTestCase(testClass) + + +class TestGuessit(TestCase): + + def checkMinimumFieldsCorrect(self, filename, filetype=None, remove_type=True, + exclude_files=None): + groundTruth = yaml.load(load_file_in_same_dir(__file__, filename)) + + def guess_func(string, options=None): + return guess_file_info(string, options=options, type=filetype) + + return self.checkFields(groundTruth, guess_func, remove_type, exclude_files) + + def checkFields(self, groundTruth, guess_func, remove_type=True, + exclude_files=None): + total = 0 + exclude_files = exclude_files or [] + + fails = defaultdict(list) + additionals = defaultdict(list) + + for filename, required_fields in groundTruth.items(): + filename = u(filename) + if filename in exclude_files: + continue + + log.debug('\n' + '-' * 120) + log.info('Guessing information for file: %s' % filename) + + options = required_fields.pop('options') if 'options' in required_fields else None + + if options: + args = shlex.split(options) + options = 
get_opts().parse_args(args) + options = vars(options) + try: + found = guess_func(filename, options) + except Exception as e: + fails[filename].append("An exception has occured in %s: %s" % (filename, e)) + log.exception("An exception has occured in %s: %s" % (filename, e)) + continue + + total = total + 1 + + # no need for these in the unittests + if remove_type: + try: + del found['type'] + except: + pass + for prop in ('container', 'mimetype', 'unidentified'): + if prop in found: + del found[prop] + + # props which are list of just 1 elem should be opened for easier writing of the tests + for prop in ('language', 'subtitleLanguage', 'other', 'episodeDetails', 'unidentified'): + value = found.get(prop, None) + if isinstance(value, list) and len(value) == 1: + found[prop] = value[0] + + # look for missing properties + for prop, value in required_fields.items(): + if prop not in found: + log.debug("Prop '%s' not found in: %s" % (prop, filename)) + fails[filename].append("'%s' not found in: %s" % (prop, filename)) + continue + + # if both properties are strings, do a case-insensitive comparison + if (isinstance(value, base_text_type) and + isinstance(found[prop], base_text_type)): + if value.lower() != found[prop].lower(): + log.debug("Wrong prop value [str] for '%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) + fails[filename].append("'%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) + + elif isinstance(value, list) and isinstance(found[prop], list): + if found[prop] and isinstance(found[prop][0], babelfish.Language): + # list of languages + s1 = set(Language.fromguessit(s) for s in value) + s2 = set(found[prop]) + else: + # by default we assume list of strings and do a case-insensitive + # comparison on their elements + s1 = set(u(s).lower() for s in value) + s2 = set(u(s).lower() for s in found[prop]) + + if s1 != s2: + log.debug("Wrong prop value [list] for '%s': expected = '%s' - received = '%s'" % (prop, 
u(value), u(found[prop]))) + fails[filename].append("'%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) + + elif isinstance(found[prop], babelfish.Language): + try: + if babelfish.Language.fromguessit(value) != found[prop]: + raise ValueError + except: + log.debug("Wrong prop value [Language] for '%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) + fails[filename].append("'%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) + + elif isinstance(found[prop], babelfish.Country): + try: + if babelfish.Country.fromguessit(value) != found[prop]: + raise ValueError + except: + log.debug("Wrong prop value [Country] for '%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) + fails[filename].append("'%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) + + + # otherwise, just compare their values directly + else: + if found[prop] != value: + log.debug("Wrong prop value for '%s': expected = '%s' [%s] - received = '%s' [%s]" % (prop, u(value), type(value), u(found[prop]), type(found[prop]))) + fails[filename].append("'%s': expected = '%s' [%s] - received = '%s' [%s]" % (prop, u(value), type(value), u(found[prop]), type(found[prop]))) + + # look for additional properties + for prop, value in found.items(): + if prop not in required_fields: + log.debug("Found additional info for prop = '%s': '%s'" % (prop, u(value))) + additionals[filename].append("'%s': '%s'" % (prop, u(value))) + + correct = total - len(fails) + log.info('SUMMARY: Guessed correctly %d out of %d filenames' % (correct, total)) + + for failed_entry, failed_properties in fails.items(): + log.error('---- ' + failed_entry + ' ----') + for failed_property in failed_properties: + log.error("FAILED: " + failed_property) + + for additional_entry, additional_properties in additionals.items(): + log.warning('---- ' + additional_entry + ' ----') + for additional_property in additional_properties: + 
log.warning("ADDITIONAL: " + additional_property) + + self.assertTrue(correct == total, + msg='Correct: %d < Total: %d' % (correct, total)) diff --git a/libs/guessit/test/movies.yaml b/libs/guessit/test/movies.yaml new file mode 100644 index 00000000..7894ef69 --- /dev/null +++ b/libs/guessit/test/movies.yaml @@ -0,0 +1,754 @@ + +? Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv +: title: Fear and Loathing in Las Vegas + year: 1998 + screenSize: 720p + format: HD-DVD + audioCodec: DTS + videoCodec: h264 + releaseGroup: ESiR + +? Movies/El Dia de la Bestia (1995)/El.dia.de.la.bestia.DVDrip.Spanish.DivX.by.Artik[SEDG].avi +: title: El Dia de la Bestia + year: 1995 + format: DVD + language: spanish + videoCodec: DivX + releaseGroup: Artik[SEDG] + +? Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv +: title: Dark City + year: 1998 + format: BluRay + screenSize: 720p + audioCodec: DTS + videoCodec: h264 + releaseGroup: CHD + +? Movies/Sin City (BluRay) (2005)/Sin.City.2005.BDRip.720p.x264.AC3-SEPTiC.mkv +: title: Sin City + year: 2005 + format: BluRay + screenSize: 720p + videoCodec: h264 + audioCodec: AC3 + releaseGroup: SEPTiC + + +? Movies/Borat (2006)/Borat.(2006).R5.PROPER.REPACK.DVDRip.XviD-PUKKA.avi +: title: Borat + year: 2006 + other: PROPER + format: DVD + other: [ R5, Proper ] + videoCodec: XviD + releaseGroup: PUKKA + + +? "[XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv" +: title: Le Prestige + format: DVD + videoCodec: h264 + videoProfile: HP + audioCodec: AAC + audioProfile: HE + language: [ french, english ] + subtitleLanguage: [ french, english ] + releaseGroup: XCT + +? Battle Royale (2000)/Battle.Royale.(Batoru.Rowaiaru).(2000).(Special.Edition).CD1of2.DVDRiP.XviD-[ZeaL].avi +: title: Battle Royale + year: 2000 + edition: special edition + cdNumber: 1 + cdNumberTotal: 2 + format: DVD + videoCodec: XviD + releaseGroup: ZeaL + +? 
Movies/Brazil (1985)/Brazil_Criterion_Edition_(1985).CD2.avi +: title: Brazil + edition: Criterion Edition + year: 1985 + cdNumber: 2 + +? Movies/Persepolis (2007)/[XCT] Persepolis [H264+Aac-128(Fr-Eng)+ST(Fr-Eng)+Ind].mkv +: title: Persepolis + year: 2007 + videoCodec: h264 + audioCodec: AAC + language: [ French, English ] + subtitleLanguage: [ French, English ] + releaseGroup: XCT + +? Movies/Toy Story (1995)/Toy Story [HDTV 720p English-Spanish].mkv +: title: Toy Story + year: 1995 + format: HDTV + screenSize: 720p + language: [ english, spanish ] + +? Movies/Office Space (1999)/Office.Space.[Dual-DVDRip].[Spanish-English].[XviD-AC3-AC3].[by.Oswald].avi +: title: Office Space + year: 1999 + format: DVD + language: [ english, spanish ] + videoCodec: XviD + audioCodec: AC3 + +? Movies/Wild Zero (2000)/Wild.Zero.DVDivX-EPiC.avi +: title: Wild Zero + year: 2000 + videoCodec: DivX + releaseGroup: EPiC + +? movies/Baraka_Edition_Collector.avi +: title: Baraka + edition: collector edition + +? Movies/Blade Runner (1982)/Blade.Runner.(1982).(Director's.Cut).CD1.DVDRip.XviD.AC3-WAF.avi +: title: Blade Runner + year: 1982 + edition: Director's Cut + cdNumber: 1 + format: DVD + videoCodec: XviD + audioCodec: AC3 + releaseGroup: WAF + +? movies/American.The.Bill.Hicks.Story.2009.DVDRip.XviD-EPiSODE.[UsaBit.com]/UsaBit.com_esd-americanbh.avi +: title: American The Bill Hicks Story + year: 2009 + format: DVD + videoCodec: XviD + releaseGroup: EPiSODE + website: UsaBit.com + +? movies/Charlie.And.Boots.DVDRip.XviD-TheWretched/wthd-cab.avi +: title: Charlie And Boots + format: DVD + videoCodec: XviD + releaseGroup: TheWretched + +? movies/Steig Larsson Millenium Trilogy (2009) BRrip 720 AAC x264/(1)The Girl With The Dragon Tattoo (2009) BRrip 720 AAC x264.mkv +: title: The Girl With The Dragon Tattoo + filmSeries: Steig Larsson Millenium Trilogy + filmNumber: 1 + year: 2009 + format: BluRay + audioCodec: AAC + videoCodec: h264 + screenSize: 720p + +? 
movies/Greenberg.REPACK.LiMiTED.DVDRip.XviD-ARROW/arw-repack-greenberg.dvdrip.xvid.avi +: title: Greenberg + format: DVD + videoCodec: XviD + releaseGroup: ARROW + other: ['Proper', 'Limited'] + +? Movies/Fr - Paris 2054, Renaissance (2005) - De Christian Volckman - (Film Divx Science Fiction Fantastique Thriller Policier N&B).avi +: title: Paris 2054, Renaissance + year: 2005 + language: french + videoCodec: DivX + +? Movies/[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi +: title: Avida + year: 2006 + language: french + format: DVD + videoCodec: XviD + releaseGroup: PROD + +? Movies/Alice in Wonderland DVDRip.XviD-DiAMOND/dmd-aw.avi +: title: Alice in Wonderland + format: DVD + videoCodec: XviD + releaseGroup: DiAMOND + +? Movies/Ne.Le.Dis.A.Personne.Fr 2 cd/personnea_mp.avi +: title: Ne Le Dis A Personne + language: french + cdNumberTotal: 2 + +? Movies/Bunker Palace Hôtel (Enki Bilal) (1989)/Enki Bilal - Bunker Palace Hotel (Fr Vhs Rip).avi +: title: Bunker Palace Hôtel + year: 1989 + language: french + format: VHS + +? Movies/21 (2008)/21.(2008).DVDRip.x264.AC3-FtS.[sharethefiles.com].mkv +: title: "21" + year: 2008 + format: DVD + videoCodec: h264 + audioCodec: AC3 + releaseGroup: FtS + website: sharethefiles.com + +? Movies/9 (2009)/9.2009.Blu-ray.DTS.720p.x264.HDBRiSe.[sharethefiles.com].mkv +: title: "9" + year: 2009 + format: BluRay + audioCodec: DTS + screenSize: 720p + videoCodec: h264 + releaseGroup: HDBRiSe + website: sharethefiles.com + +? Movies/Mamma.Mia.2008.DVDRip.AC3.XviD-CrazyTeam/Mamma.Mia.2008.DVDRip.AC3.XviD-CrazyTeam.avi +: title: Mamma Mia + year: 2008 + format: DVD + audioCodec: AC3 + videoCodec: XviD + releaseGroup: CrazyTeam + +? Movies/M.A.S.H. (1970)/MASH.(1970).[Divx.5.02][Dual-Subtitulos][DVDRip].ogm +: title: M.A.S.H. + year: 1970 + videoCodec: DivX + format: DVD + +? 
Movies/The Doors (1991)/09.03.08.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv +: title: The Doors + year: 1991 + date: 2008-03-09 + format: BluRay + screenSize: 720p + audioCodec: AC3 + videoCodec: h264 + releaseGroup: HiS@SiLUHD + language: english + website: sharethefiles.com + +? Movies/The Doors (1991)/08.03.09.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv +: options: --date-year-first + title: The Doors + year: 1991 + date: 2008-03-09 + format: BluRay + screenSize: 720p + audioCodec: AC3 + videoCodec: h264 + releaseGroup: HiS@SiLUHD + language: english + website: sharethefiles.com + +? Movies/Ratatouille/video_ts-ratatouille.srt +: title: Ratatouille + format: DVD + +? Movies/001 __ A classer/Fantomas se déchaine - Louis de Funès.avi +: title: Fantomas se déchaine + +? Movies/Comme une Image (2004)/Comme.Une.Image.FRENCH.DVDRiP.XViD-NTK.par-www.divx-overnet.com.avi +: title: Comme une Image + year: 2004 + language: french + format: DVD + videoCodec: XviD + releaseGroup: NTK + website: www.divx-overnet.com + +? Movies/Fantastic Mr Fox/Fantastic.Mr.Fox.2009.DVDRip.{x264+LC-AAC.5.1}{Fr-Eng}{Sub.Fr-Eng}-™.[sharethefiles.com].mkv +: title: Fantastic Mr Fox + year: 2009 + format: DVD + videoCodec: h264 + audioCodec: AAC + audioProfile: LC + audioChannels: "5.1" + language: [ french, english ] + subtitleLanguage: [ french, english ] + website: sharethefiles.com + +? Movies/Somewhere.2010.DVDRip.XviD-iLG/i-smwhr.avi +: title: Somewhere + year: 2010 + format: DVD + videoCodec: XviD + releaseGroup: iLG + +? Movies/Moon_(2009).mkv +: title: Moon + year: 2009 + +? Movies/Moon_(2009)-x01.mkv +: title: Moon + year: 2009 + bonusNumber: 1 + +? Movies/Moon_(2009)-x02-Making_Of.mkv +: title: Moon + year: 2009 + bonusNumber: 2 + bonusTitle: Making Of + +? movies/James_Bond-f17-Goldeneye.mkv +: title: Goldeneye + filmSeries: James Bond + filmNumber: 17 + +? 
/movies/James_Bond-f21-Casino_Royale.mkv +: title: Casino Royale + filmSeries: James Bond + filmNumber: 21 + +? /movies/James_Bond-f21-Casino_Royale-x01-Becoming_Bond.mkv +: title: Casino Royale + filmSeries: James Bond + filmNumber: 21 + bonusNumber: 1 + bonusTitle: Becoming Bond + +? /movies/James_Bond-f21-Casino_Royale-x02-Stunts.mkv +: title: Casino Royale + filmSeries: James Bond + filmNumber: 21 + bonusNumber: 2 + bonusTitle: Stunts + +? OSS_117--Cairo,_Nest_of_Spies.mkv +: title: OSS 117--Cairo, Nest of Spies + +? The Godfather Part III.mkv +: title: The Godfather + part: 3 + +? Foobar Part VI.mkv +: title: Foobar + part: 6 + +? The_Insider-(1999)-x02-60_Minutes_Interview-1996.mp4 +: title: The Insider + year: 1999 + bonusNumber: 2 + bonusTitle: 60 Minutes Interview-1996 + +? Rush.._Beyond_The_Lighted_Stage-x09-Between_Sun_and_Moon-2002_Hartford.mkv +: title: Rush Beyond The Lighted Stage + bonusNumber: 9 + bonusTitle: Between Sun and Moon-2002 Hartford + +? /public/uTorrent/Downloads Finished/Movies/Indiana.Jones.and.the.Temple.of.Doom.1984.HDTV.720p.x264.AC3.5.1-REDµX/Indiana.Jones.and.the.Temple.of.Doom.1984.HDTV.720p.x264.AC3.5.1-REDµX.mkv +: title: Indiana Jones and the Temple of Doom + year: 1984 + format: HDTV + screenSize: 720p + videoCodec: h264 + audioCodec: AC3 + audioChannels: "5.1" + releaseGroup: REDµX + +? The.Director’s.Notebook.2006.Blu-Ray.x264.DXVA.720p.AC3-de[42].mkv +: title: The Director’s Notebook + year: 2006 + format: BluRay + videoCodec: h264 + videoApi: DXVA + screenSize: 720p + audioCodec: AC3 + releaseGroup: de[42] + +? Movies/Cosmopolis.2012.LiMiTED.720p.BluRay.x264-AN0NYM0US[bb]/ano-cosmo.720p.mkv +: title: Cosmopolis + year: 2012 + screenSize: 720p + videoCodec: h264 + releaseGroup: AN0NYM0US[bb] + format: BluRay + other: LIMITED + +? 
movies/La Science des Rêves (2006)/La.Science.Des.Reves.FRENCH.DVDRip.XviD-MP-AceBot.avi +: title: La Science des Rêves + year: 2006 + format: DVD + videoCodec: XviD + videoProfile: MP + releaseGroup: AceBot + language: French + +? The_Italian_Job.mkv +: title: The Italian Job + +? The.Rum.Diary.2011.1080p.BluRay.DTS.x264.D-Z0N3.mkv +: title: The Rum Diary + year: 2011 + screenSize: 1080p + format: BluRay + videoCodec: h264 + audioCodec: DTS + releaseGroup: D-Z0N3 + +? Life.Of.Pi.2012.1080p.BluRay.DTS.x264.D-Z0N3.mkv +: title: Life Of Pi + year: 2012 + screenSize: 1080p + format: BluRay + videoCodec: h264 + audioCodec: DTS + releaseGroup: D-Z0N3 + +? The.Kings.Speech.2010.1080p.BluRay.DTS.x264.D Z0N3.mkv +: title: The Kings Speech + year: 2010 + screenSize: 1080p + format: BluRay + audioCodec: DTS + videoCodec: h264 + releaseGroup: D Z0N3 + +? Street.Kings.2008.BluRay.1080p.DTS.x264.dxva EuReKA.mkv +: title: Street Kings + year: 2008 + format: BluRay + screenSize: 1080p + audioCodec: DTS + videoCodec: h264 + videoApi: DXVA + releaseGroup: EuReKa + +? 2001.A.Space.Odyssey.1968.HDDVD.1080p.DTS.x264.dxva EuReKA.mkv +: title: 2001 A Space Odyssey + year: 1968 + format: HD-DVD + screenSize: 1080p + audioCodec: DTS + videoCodec: h264 + videoApi: DXVA + releaseGroup: EuReKa + +? 2012.2009.720p.BluRay.x264.DTS WiKi.mkv +: title: "2012" + year: 2009 + screenSize: 720p + format: BluRay + videoCodec: h264 + audioCodec: DTS + releaseGroup: WiKi + +? /share/Download/movie/Dead Man Down (2013) BRRiP XViD DD5_1 Custom NLSubs =-_lt Q_o_Q gt-=_/XD607ebb-BRc59935-5155473f-1c5f49/XD607ebb-BRc59935-5155473f-1c5f49.avi +: title: Dead Man Down + year: 2013 + format: BluRay + videoCodec: XviD + audioChannels: "5.1" + audioCodec: DolbyDigital + idNumber: XD607ebb-BRc59935-5155473f-1c5f49 + +? Pacific.Rim.3D.2013.COMPLETE.BLURAY-PCH.avi +: title: Pacific Rim + year: 2013 + format: BluRay + other: + - complete + - 3D + releaseGroup: PCH + +? 
Immersion.French.2011.STV.READNFO.QC.FRENCH.ENGLISH.NTSC.DVDR.nfo +: title: Immersion French + year: 2011 + language: + - French + - English + format: DVD + +? Immersion.French.2011.STV.READNFO.QC.FRENCH.NTSC.DVDR.nfo +: title: Immersion French + year: 2011 + language: French + format: DVD + +? Immersion.French.2011.STV.READNFO.QC.NTSC.DVDR.nfo +: title: Immersion French + year: 2011 + format: DVD + +? French.Immersion.2011.STV.READNFO.QC.ENGLISH.NTSC.DVDR.nfo +: title: French Immersion + year: 2011 + language: ENGLISH + format: DVD + +? Howl's_Moving_Castle_(2004)_[720p,HDTV,x264,DTS]-FlexGet.avi +: videoCodec: h264 + format: HDTV + title: Howl's Moving Castle + screenSize: 720p + year: 2004 + audioCodec: DTS + releaseGroup: FlexGet + +? Pirates de langkasuka.2008.FRENCH.1920X1080.h264.AVC.AsiaRa.mkv +: screenSize: 1080p + year: 2008 + language: French + videoCodec: h264 + title: Pirates de langkasuka + releaseGroup: AsiaRa + +? Masala (2013) Telugu Movie HD DVDScr XviD - Exclusive.avi +: year: 2013 + videoCodec: XviD + title: Masala + format: HD-DVD + other: screener + language: Telugu + releaseGroup: Exclusive + +? Django Unchained 2012 DVDSCR X264 AAC-P2P.nfo +: year: 2012 + other: screener + videoCodec: h264 + title: Django Unchained + audioCodec: AAC + format: DVD + releaseGroup: P2P + +? Ejecutiva.En.Apuros(2009).BLURAY.SCR.Xvid.Spanish.LanzamientosD.nfo +: year: 2009 + other: screener + format: BluRay + videoCodec: XviD + language: Spanish + title: Ejecutiva En Apuros + +? Die.Schluempfe.2.German.DL.1080p.BluRay.x264-EXQUiSiTE.mkv +: title: Die Schluempfe 2 + format: BluRay + language: + - Multiple languages + - German + videoCodec: h264 + releaseGroup: EXQUiSiTE + screenSize: 1080p + +? Rocky 1976 French SubForced BRRip x264 AC3-FUNKY.mkv +: title: Rocky + year: 1976 + subtitleLanguage: French + format: BluRay + videoCodec: h264 + audioCodec: AC3 + releaseGroup: FUNKY + +? 
REDLINE (BD 1080p H264 10bit FLAC) [3xR].mkv +: title: REDLINE + format: BluRay + videoCodec: h264 + videoProfile: 10bit + audioCodec: Flac + screenSize: 1080p + +? The.Lizzie.McGuire.Movie.(2003).HR.DVDRiP.avi +: title: The Lizzie McGuire Movie + year: 2003 + format: DVD + other: HR + +? Hua.Mulan.BRRIP.MP4.x264.720p-HR.avi +: title: Hua Mulan + videoCodec: h264 + format: BluRay + screenSize: 720p + other: HR + +? Dr.Seuss.The.Lorax.2012.DVDRip.LiNE.XviD.AC3.HQ.Hive-CM8.mp4 +: videoCodec: XviD + title: Dr Seuss The Lorax + format: DVD + other: LiNE + year: 2012 + audioCodec: AC3 + audioProfile: HQ + releaseGroup: Hive-CM8 + + +? "Star Wars: Episode IV - A New Hope (2004) Special Edition.MKV" +: title: Star Wars Episode IV + year: 2004 + edition: Special Edition + +? Dr.LiNE.The.Lorax.2012.DVDRip.LiNE.XviD.AC3.HQ.Hive-CM8.mp4 +: videoCodec: XviD + title: Dr LiNE The Lorax + format: DVD + other: LiNE + year: 2012 + audioCodec: AC3 + audioProfile: HQ + releaseGroup: Hive-CM8 + +? Perfect Child-2007-TRUEFRENCH-TVRip.Xvid-h@mster.avi +: releaseGroup: h@mster + title: Perfect Child + videoCodec: XviD + language: French + format: TV + year: 2007 + +? entre.ciel.et.terre.(1994).dvdrip.h264.aac-psypeon.avi +: audioCodec: AAC + format: DVD + releaseGroup: psypeon + title: entre ciel et terre + videoCodec: h264 + year: 1994 + +? Yves.Saint.Laurent.2013.FRENCH.DVDSCR.MD.XviD-ViVARiUM.avi +: format: DVD + language: French + other: Screener + releaseGroup: ViVARiUM + title: Yves Saint Laurent + videoCodec: XviD + year: 2013 + +? Echec et Mort - Hard to Kill - Steven Seagal Multi 1080p BluRay x264 CCATS.avi +: format: BluRay + language: Multiple languages + releaseGroup: CCATS + screenSize: 1080p + title: Echec et Mort + videoCodec: h264 + +? Paparazzi - Timsit/Lindon (MKV 1080p tvripHD) +: options: -n + title: Paparazzi + screenSize: 1080p + format: HDTV + +? 
some.movie.720p.bluray.x264-mind +: options: -n + title: some movie + screenSize: 720p + videoCodec: h264 + releaseGroup: mind + format: BluRay + +? Dr LiNE The Lorax 720p h264 BluRay +: options: -n + title: Dr LiNE The Lorax + screenSize: 720p + videoCodec: h264 + format: BluRay + +? BeatdownFrenchDVDRip.mkv +: options: -c + title: Beatdown + language: French + format: DVD + +? YvesSaintLaurent2013FrenchDVDScrXvid.avi +: options: -c + format: DVD + language: French + other: Screener + title: Yves saint laurent + videoCodec: XviD + year: 2013 + +? Elle.s.en.va.720p.mkv +: screenSize: 720p + title: Elle s en va + +? FooBar.7.PDTV-FlexGet +: options: -n + format: DVB + releaseGroup: FlexGet + title: FooBar 7 + +? h265 - HEVC Riddick Unrated Director Cut French 1080p DTS.mkv +: audioCodec: DTS + edition: Director's cut + language: fr + screenSize: 1080p + title: Riddick Unrated + videoCodec: h265 + +? "[h265 - HEVC] Riddick Unrated Director Cut French [1080p DTS].mkv" +: audioCodec: DTS + edition: Director's cut + language: fr + screenSize: 1080p + title: Riddick Unrated + videoCodec: h265 + +? Barbecue-2014-French-mHD-1080p +: options: -n + language: fr + other: mHD + screenSize: 1080p + title: Barbecue + year: 2014 + +? Underworld Quadrilogie VO+VFF+VFQ 1080p HDlight.x264~Tonyk~Monde Infernal +: options: -n + language: + - fr + - vo + other: HDLight + screenSize: 1080p + title: Underworld Quadrilogie + videoCodec: h264 + +? A Bout Portant (The Killers).PAL.Multi.DVD-R-KZ +: options: -n + format: DVD + language: mul + releaseGroup: KZ + title: A Bout Portant + +? "Mise à Sac (Alain Cavalier, 1967) [Vhs.Rip.Vff]" +: options: -n + format: VHS + language: fr + title: "Mise à Sac" + year: 1967 + +? A Bout Portant (The Killers).PAL.Multi.DVD-R-KZ +: options: -n + format: DVD + language: mul + releaseGroup: KZ + title: A Bout Portant + +? 
Youth.In.Revolt.(Be.Bad).2009.MULTI.1080p.LAME3*92-MEDIOZZ +: options: -n + audioCodec: MP3 + language: mul + releaseGroup: MEDIOZZ + screenSize: 1080p + title: Youth In Revolt + year: 2009 + +? La Defense Lincoln (The Lincoln Lawyer) 2011 [DVDRIP][Vostfr] +: options: -n + format: DVD + subtitleLanguage: fr + title: La Defense Lincoln + year: 2011 + +? '[h265 - HEVC] Fight Club French 1080p DTS.' +: options: -n + audioCodec: DTS + language: fr + screenSize: 1080p + title: Fight Club + videoCodec: h265 + +? Love Gourou (Mike Myers) - FR +: options: -n + language: fr + title: Love Gourou + +? '[h265 - hevc] transformers 2 1080p french ac3 6ch.' +: options: -n + audioChannels: '5.1' + audioCodec: AC3 + language: fr + screenSize: 1080p + title: transformers 2 + videoCodec: h265 diff --git a/libs/guessit/test/opensubtitles_languages_2012_05_09.txt b/libs/guessit/test/opensubtitles_languages_2012_05_09.txt new file mode 100644 index 00000000..4a08d9b5 --- /dev/null +++ b/libs/guessit/test/opensubtitles_languages_2012_05_09.txt @@ -0,0 +1,473 @@ +IdSubLanguage ISO639 LanguageName UploadEnabled WebEnabled +aar aa Afar, afar 0 0 +abk ab Abkhazian 0 0 +ace Achinese 0 0 +ach Acoli 0 0 +ada Adangme 0 0 +ady adyghé 0 0 +afa Afro-Asiatic (Other) 0 0 +afh Afrihili 0 0 +afr af Afrikaans 0 0 +ain Ainu 0 0 +aka ak Akan 0 0 +akk Akkadian 0 0 +alb sq Albanian 1 1 +ale Aleut 0 0 +alg Algonquian languages 0 0 +alt Southern Altai 0 0 +amh am Amharic 0 0 +ang English, Old (ca.450-1100) 0 0 +apa Apache languages 0 0 +ara ar Arabic 1 1 +arc Aramaic 0 0 +arg an Aragonese 0 0 +arm hy Armenian 1 0 +arn Araucanian 0 0 +arp Arapaho 0 0 +art Artificial (Other) 0 0 +arw Arawak 0 0 +asm as Assamese 0 0 +ast Asturian, Bable 0 0 +ath Athapascan languages 0 0 +aus Australian languages 0 0 +ava av Avaric 0 0 +ave ae Avestan 0 0 +awa Awadhi 0 0 +aym ay Aymara 0 0 +aze az Azerbaijani 0 0 +bad Banda 0 0 +bai Bamileke languages 0 0 +bak ba Bashkir 0 0 +bal Baluchi 0 0 +bam bm Bambara 0 0 +ban Balinese 0 0 
+baq eu Basque 1 1 +bas Basa 0 0 +bat Baltic (Other) 0 0 +bej Beja 0 0 +bel be Belarusian 0 0 +bem Bemba 0 0 +ben bn Bengali 1 0 +ber Berber (Other) 0 0 +bho Bhojpuri 0 0 +bih bh Bihari 0 0 +bik Bikol 0 0 +bin Bini 0 0 +bis bi Bislama 0 0 +bla Siksika 0 0 +bnt Bantu (Other) 0 0 +bos bs Bosnian 1 0 +bra Braj 0 0 +bre br Breton 1 0 +btk Batak (Indonesia) 0 0 +bua Buriat 0 0 +bug Buginese 0 0 +bul bg Bulgarian 1 1 +bur my Burmese 0 0 +byn Blin 0 0 +cad Caddo 0 0 +cai Central American Indian (Other) 0 0 +car Carib 0 0 +cat ca Catalan 1 1 +cau Caucasian (Other) 0 0 +ceb Cebuano 0 0 +cel Celtic (Other) 0 0 +cha ch Chamorro 0 0 +chb Chibcha 0 0 +che ce Chechen 0 0 +chg Chagatai 0 0 +chi zh Chinese 1 1 +chk Chuukese 0 0 +chm Mari 0 0 +chn Chinook jargon 0 0 +cho Choctaw 0 0 +chp Chipewyan 0 0 +chr Cherokee 0 0 +chu cu Church Slavic 0 0 +chv cv Chuvash 0 0 +chy Cheyenne 0 0 +cmc Chamic languages 0 0 +cop Coptic 0 0 +cor kw Cornish 0 0 +cos co Corsican 0 0 +cpe Creoles and pidgins, English based (Other) 0 0 +cpf Creoles and pidgins, French-based (Other) 0 0 +cpp Creoles and pidgins, Portuguese-based (Other) 0 0 +cre cr Cree 0 0 +crh Crimean Tatar 0 0 +crp Creoles and pidgins (Other) 0 0 +csb Kashubian 0 0 +cus Cushitic (Other)' couchitiques, autres langues 0 0 +cze cs Czech 1 1 +dak Dakota 0 0 +dan da Danish 1 1 +dar Dargwa 0 0 +day Dayak 0 0 +del Delaware 0 0 +den Slave (Athapascan) 0 0 +dgr Dogrib 0 0 +din Dinka 0 0 +div dv Divehi 0 0 +doi Dogri 0 0 +dra Dravidian (Other) 0 0 +dua Duala 0 0 +dum Dutch, Middle (ca.1050-1350) 0 0 +dut nl Dutch 1 1 +dyu Dyula 0 0 +dzo dz Dzongkha 0 0 +efi Efik 0 0 +egy Egyptian (Ancient) 0 0 +eka Ekajuk 0 0 +elx Elamite 0 0 +eng en English 1 1 +enm English, Middle (1100-1500) 0 0 +epo eo Esperanto 1 0 +est et Estonian 1 1 +ewe ee Ewe 0 0 +ewo Ewondo 0 0 +fan Fang 0 0 +fao fo Faroese 0 0 +fat Fanti 0 0 +fij fj Fijian 0 0 +fil Filipino 0 0 +fin fi Finnish 1 1 +fiu Finno-Ugrian (Other) 0 0 +fon Fon 0 0 +fre fr French 1 1 +frm French, Middle 
(ca.1400-1600) 0 0 +fro French, Old (842-ca.1400) 0 0 +fry fy Frisian 0 0 +ful ff Fulah 0 0 +fur Friulian 0 0 +gaa Ga 0 0 +gay Gayo 0 0 +gba Gbaya 0 0 +gem Germanic (Other) 0 0 +geo ka Georgian 1 1 +ger de German 1 1 +gez Geez 0 0 +gil Gilbertese 0 0 +gla gd Gaelic 0 0 +gle ga Irish 0 0 +glg gl Galician 1 1 +glv gv Manx 0 0 +gmh German, Middle High (ca.1050-1500) 0 0 +goh German, Old High (ca.750-1050) 0 0 +gon Gondi 0 0 +gor Gorontalo 0 0 +got Gothic 0 0 +grb Grebo 0 0 +grc Greek, Ancient (to 1453) 0 0 +ell el Greek 1 1 +grn gn Guarani 0 0 +guj gu Gujarati 0 0 +gwi Gwich´in 0 0 +hai Haida 0 0 +hat ht Haitian 0 0 +hau ha Hausa 0 0 +haw Hawaiian 0 0 +heb he Hebrew 1 1 +her hz Herero 0 0 +hil Hiligaynon 0 0 +him Himachali 0 0 +hin hi Hindi 1 1 +hit Hittite 0 0 +hmn Hmong 0 0 +hmo ho Hiri Motu 0 0 +hrv hr Croatian 1 1 +hun hu Hungarian 1 1 +hup Hupa 0 0 +iba Iban 0 0 +ibo ig Igbo 0 0 +ice is Icelandic 1 1 +ido io Ido 0 0 +iii ii Sichuan Yi 0 0 +ijo Ijo 0 0 +iku iu Inuktitut 0 0 +ile ie Interlingue 0 0 +ilo Iloko 0 0 +ina ia Interlingua (International Auxiliary Language Asso 0 0 +inc Indic (Other) 0 0 +ind id Indonesian 1 1 +ine Indo-European (Other) 0 0 +inh Ingush 0 0 +ipk ik Inupiaq 0 0 +ira Iranian (Other) 0 0 +iro Iroquoian languages 0 0 +ita it Italian 1 1 +jav jv Javanese 0 0 +jpn ja Japanese 1 1 +jpr Judeo-Persian 0 0 +jrb Judeo-Arabic 0 0 +kaa Kara-Kalpak 0 0 +kab Kabyle 0 0 +kac Kachin 0 0 +kal kl Kalaallisut 0 0 +kam Kamba 0 0 +kan kn Kannada 0 0 +kar Karen 0 0 +kas ks Kashmiri 0 0 +kau kr Kanuri 0 0 +kaw Kawi 0 0 +kaz kk Kazakh 1 0 +kbd Kabardian 0 0 +kha Khasi 0 0 +khi Khoisan (Other) 0 0 +khm km Khmer 1 1 +kho Khotanese 0 0 +kik ki Kikuyu 0 0 +kin rw Kinyarwanda 0 0 +kir ky Kirghiz 0 0 +kmb Kimbundu 0 0 +kok Konkani 0 0 +kom kv Komi 0 0 +kon kg Kongo 0 0 +kor ko Korean 1 1 +kos Kosraean 0 0 +kpe Kpelle 0 0 +krc Karachay-Balkar 0 0 +kro Kru 0 0 +kru Kurukh 0 0 +kua kj Kuanyama 0 0 +kum Kumyk 0 0 +kur ku Kurdish 0 0 +kut Kutenai 0 0 +lad Ladino 0 0 +lah 
Lahnda 0 0 +lam Lamba 0 0 +lao lo Lao 0 0 +lat la Latin 0 0 +lav lv Latvian 1 0 +lez Lezghian 0 0 +lim li Limburgan 0 0 +lin ln Lingala 0 0 +lit lt Lithuanian 1 0 +lol Mongo 0 0 +loz Lozi 0 0 +ltz lb Luxembourgish 1 0 +lua Luba-Lulua 0 0 +lub lu Luba-Katanga 0 0 +lug lg Ganda 0 0 +lui Luiseno 0 0 +lun Lunda 0 0 +luo Luo (Kenya and Tanzania) 0 0 +lus lushai 0 0 +mac mk Macedonian 1 1 +mad Madurese 0 0 +mag Magahi 0 0 +mah mh Marshallese 0 0 +mai Maithili 0 0 +mak Makasar 0 0 +mal ml Malayalam 0 0 +man Mandingo 0 0 +mao mi Maori 0 0 +map Austronesian (Other) 0 0 +mar mr Marathi 0 0 +mas Masai 0 0 +may ms Malay 1 1 +mdf Moksha 0 0 +mdr Mandar 0 0 +men Mende 0 0 +mga Irish, Middle (900-1200) 0 0 +mic Mi'kmaq 0 0 +min Minangkabau 0 0 +mis Miscellaneous languages 0 0 +mkh Mon-Khmer (Other) 0 0 +mlg mg Malagasy 0 0 +mlt mt Maltese 0 0 +mnc Manchu 0 0 +mni Manipuri 0 0 +mno Manobo languages 0 0 +moh Mohawk 0 0 +mol mo Moldavian 0 0 +mon mn Mongolian 1 0 +mos Mossi 0 0 +mwl Mirandese 0 0 +mul Multiple languages 0 0 +mun Munda languages 0 0 +mus Creek 0 0 +mwr Marwari 0 0 +myn Mayan languages 0 0 +myv Erzya 0 0 +nah Nahuatl 0 0 +nai North American Indian 0 0 +nap Neapolitan 0 0 +nau na Nauru 0 0 +nav nv Navajo 0 0 +nbl nr Ndebele, South 0 0 +nde nd Ndebele, North 0 0 +ndo ng Ndonga 0 0 +nds Low German 0 0 +nep ne Nepali 0 0 +new Nepal Bhasa 0 0 +nia Nias 0 0 +nic Niger-Kordofanian (Other) 0 0 +niu Niuean 0 0 +nno nn Norwegian Nynorsk 0 0 +nob nb Norwegian Bokmal 0 0 +nog Nogai 0 0 +non Norse, Old 0 0 +nor no Norwegian 1 1 +nso Northern Sotho 0 0 +nub Nubian languages 0 0 +nwc Classical Newari 0 0 +nya ny Chichewa 0 0 +nym Nyamwezi 0 0 +nyn Nyankole 0 0 +nyo Nyoro 0 0 +nzi Nzima 0 0 +oci oc Occitan 1 1 +oji oj Ojibwa 0 0 +ori or Oriya 0 0 +orm om Oromo 0 0 +osa Osage 0 0 +oss os Ossetian 0 0 +ota Turkish, Ottoman (1500-1928) 0 0 +oto Otomian languages 0 0 +paa Papuan (Other) 0 0 +pag Pangasinan 0 0 +pal Pahlavi 0 0 +pam Pampanga 0 0 +pan pa Panjabi 0 0 +pap Papiamento 0 0 
+pau Palauan 0 0 +peo Persian, Old (ca.600-400 B.C.) 0 0 +per fa Persian 1 1 +phi Philippine (Other) 0 0 +phn Phoenician 0 0 +pli pi Pali 0 0 +pol pl Polish 1 1 +pon Pohnpeian 0 0 +por pt Portuguese 1 1 +pra Prakrit languages 0 0 +pro Provençal, Old (to 1500) 0 0 +pus ps Pushto 0 0 +que qu Quechua 0 0 +raj Rajasthani 0 0 +rap Rapanui 0 0 +rar Rarotongan 0 0 +roa Romance (Other) 0 0 +roh rm Raeto-Romance 0 0 +rom Romany 0 0 +run rn Rundi 0 0 +rup Aromanian 0 0 +rus ru Russian 1 1 +sad Sandawe 0 0 +sag sg Sango 0 0 +sah Yakut 0 0 +sai South American Indian (Other) 0 0 +sal Salishan languages 0 0 +sam Samaritan Aramaic 0 0 +san sa Sanskrit 0 0 +sas Sasak 0 0 +sat Santali 0 0 +scc sr Serbian 1 1 +scn Sicilian 0 0 +sco Scots 0 0 +sel Selkup 0 0 +sem Semitic (Other) 0 0 +sga Irish, Old (to 900) 0 0 +sgn Sign Languages 0 0 +shn Shan 0 0 +sid Sidamo 0 0 +sin si Sinhalese 1 1 +sio Siouan languages 0 0 +sit Sino-Tibetan (Other) 0 0 +sla Slavic (Other) 0 0 +slo sk Slovak 1 1 +slv sl Slovenian 1 1 +sma Southern Sami 0 0 +sme se Northern Sami 0 0 +smi Sami languages (Other) 0 0 +smj Lule Sami 0 0 +smn Inari Sami 0 0 +smo sm Samoan 0 0 +sms Skolt Sami 0 0 +sna sn Shona 0 0 +snd sd Sindhi 0 0 +snk Soninke 0 0 +sog Sogdian 0 0 +som so Somali 0 0 +son Songhai 0 0 +sot st Sotho, Southern 0 0 +spa es Spanish 1 1 +srd sc Sardinian 0 0 +srr Serer 0 0 +ssa Nilo-Saharan (Other) 0 0 +ssw ss Swati 0 0 +suk Sukuma 0 0 +sun su Sundanese 0 0 +sus Susu 0 0 +sux Sumerian 0 0 +swa sw Swahili 1 0 +swe sv Swedish 1 1 +syr Syriac 1 0 +tah ty Tahitian 0 0 +tai Tai (Other) 0 0 +tam ta Tamil 0 0 +tat tt Tatar 0 0 +tel te Telugu 0 0 +tem Timne 0 0 +ter Tereno 0 0 +tet Tetum 0 0 +tgk tg Tajik 0 0 +tgl tl Tagalog 1 1 +tha th Thai 1 1 +tib bo Tibetan 0 0 +tig Tigre 0 0 +tir ti Tigrinya 0 0 +tiv Tiv 0 0 +tkl Tokelau 0 0 +tlh Klingon 0 0 +tli Tlingit 0 0 +tmh Tamashek 0 0 +tog Tonga (Nyasa) 0 0 +ton to Tonga (Tonga Islands) 0 0 +tpi Tok Pisin 0 0 +tsi Tsimshian 0 0 +tsn tn Tswana 0 0 +tso ts Tsonga 0 0 +tuk 
tk Turkmen 0 0 +tum Tumbuka 0 0 +tup Tupi languages 0 0 +tur tr Turkish 1 1 +tut Altaic (Other) 0 0 +tvl Tuvalu 0 0 +twi tw Twi 0 0 +tyv Tuvinian 0 0 +udm Udmurt 0 0 +uga Ugaritic 0 0 +uig ug Uighur 0 0 +ukr uk Ukrainian 1 1 +umb Umbundu 0 0 +und Undetermined 0 0 +urd ur Urdu 1 0 +uzb uz Uzbek 0 0 +vai Vai 0 0 +ven ve Venda 0 0 +vie vi Vietnamese 1 1 +vol vo Volapük 0 0 +vot Votic 0 0 +wak Wakashan languages 0 0 +wal Walamo 0 0 +war Waray 0 0 +was Washo 0 0 +wel cy Welsh 0 0 +wen Sorbian languages 0 0 +wln wa Walloon 0 0 +wol wo Wolof 0 0 +xal Kalmyk 0 0 +xho xh Xhosa 0 0 +yao Yao 0 0 +yap Yapese 0 0 +yid yi Yiddish 0 0 +yor yo Yoruba 0 0 +ypk Yupik languages 0 0 +zap Zapotec 0 0 +zen Zenaga 0 0 +zha za Zhuang 0 0 +znd Zande 0 0 +zul zu Zulu 0 0 +zun Zuni 0 0 +rum ro Romanian 1 1 +pob pb Brazilian 1 1 diff --git a/libs/guessit/test/test_api.py b/libs/guessit/test/test_api.py new file mode 100644 index 00000000..92cef41b --- /dev/null +++ b/libs/guessit/test/test_api.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2014 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . 
+# + +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.test.guessittest import * + + +class TestApi(TestGuessit): + def test_api(self): + movie_path = 'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv' + + movie_info = guessit.guess_movie_info(movie_path) + video_info = guessit.guess_video_info(movie_path) + episode_info = guessit.guess_episode_info(movie_path) + file_info = guessit.guess_file_info(movie_path) + + self.assertEqual(guessit.guess_file_info(movie_path, type='movie'), movie_info) + self.assertEqual(guessit.guess_file_info(movie_path, type='video'), video_info) + self.assertEqual(guessit.guess_file_info(movie_path, type='episode'), episode_info) + + self.assertEqual(guessit.guess_file_info(movie_path, options={'type': 'movie'}), movie_info) + self.assertEqual(guessit.guess_file_info(movie_path, options={'type': 'video'}), video_info) + self.assertEqual(guessit.guess_file_info(movie_path, options={'type': 'episode'}), episode_info) + + self.assertEqual(guessit.guess_file_info(movie_path, options={'type': 'episode'}, type='movie'), episode_info) # kwargs priority other options + + movie_path_name_only = 'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD' + file_info_name_only = guessit.guess_file_info(movie_path_name_only, options={"name_only": True}) + + self.assertFalse('container' in file_info_name_only) + self.assertTrue('container' in file_info) + +suite = allTests(TestApi) + +if __name__ == '__main__': + TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_autodetect.py b/libs/guessit/test/test_autodetect.py new file mode 100644 index 00000000..229b491f --- /dev/null +++ b/libs/guessit/test/test_autodetect.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or 
modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . +# + +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.test.guessittest import * + + +class TestAutoDetect(TestGuessit): + def testEmpty(self): + result = guessit.guess_file_info('') + self.assertEqual(result, {}) + + result = guessit.guess_file_info('___-__') + self.assertEqual(result, {}) + + result = guessit.guess_file_info('__-.avc') + self.assertEqual(result, {'type': 'unknown', 'extension': 'avc'}) + + def testAutoDetect(self): + self.checkMinimumFieldsCorrect(filename='autodetect.yaml', + remove_type=False) + + +suite = allTests(TestAutoDetect) + +if __name__ == '__main__': + TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_autodetect_all.py b/libs/guessit/test/test_autodetect_all.py new file mode 100644 index 00000000..033e1571 --- /dev/null +++ b/libs/guessit/test/test_autodetect_all.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. 
+# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . +# + +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.test.guessittest import * + +IGNORE_EPISODES = [] +IGNORE_MOVIES = [] + + +class TestAutoDetectAll(TestGuessit): + def testAutoMatcher(self): + self.checkMinimumFieldsCorrect(filename='autodetect.yaml', + remove_type=False) + + def testAutoMatcherMovies(self): + self.checkMinimumFieldsCorrect(filename='movies.yaml', + exclude_files=IGNORE_MOVIES) + + def testAutoMatcherEpisodes(self): + self.checkMinimumFieldsCorrect(filename='episodes.yaml', + exclude_files=IGNORE_EPISODES) + + +suite = allTests(TestAutoDetectAll) + +if __name__ == '__main__': + TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_doctests.py b/libs/guessit/test/test_doctests.py new file mode 100644 index 00000000..9fedeb0f --- /dev/null +++ b/libs/guessit/test/test_doctests.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2014 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. 
+# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . +# + +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.test.guessittest import * +import guessit +import guessit.hash_ed2k +import unittest +import doctest + + +def load_tests(loader, tests, ignore): + tests.addTests(doctest.DocTestSuite(guessit)) + tests.addTests(doctest.DocTestSuite(guessit.date)) + tests.addTests(doctest.DocTestSuite(guessit.fileutils)) + tests.addTests(doctest.DocTestSuite(guessit.guess)) + tests.addTests(doctest.DocTestSuite(guessit.hash_ed2k)) + tests.addTests(doctest.DocTestSuite(guessit.language)) + tests.addTests(doctest.DocTestSuite(guessit.matchtree)) + tests.addTests(doctest.DocTestSuite(guessit.textutils)) + return tests + +suite = unittest.TestSuite() +load_tests(None, suite, None) + +if __name__ == '__main__': + TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_episode.py b/libs/guessit/test/test_episode.py new file mode 100644 index 00000000..03abf6b0 --- /dev/null +++ b/libs/guessit/test/test_episode.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . 
+# + +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.test.guessittest import * + + +class TestEpisode(TestGuessit): + def testEpisodes(self): + self.checkMinimumFieldsCorrect(filetype='episode', + filename='episodes.yaml') + + +suite = allTests(TestEpisode) + +if __name__ == '__main__': + TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_hashes.py b/libs/guessit/test/test_hashes.py new file mode 100644 index 00000000..a8bc763c --- /dev/null +++ b/libs/guessit/test/test_hashes.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . 
+# + +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.test.guessittest import * + + +class TestHashes(TestGuessit): + def test_hashes(self): + hashes = ( + ('hash_mpc', '1MB', u'8542ad406c15c8bd'), # TODO: Check if this value is valid + ('hash_ed2k', '1MB', u'ed2k://|file|1MB|1048576|AA3CC5552A9931A76B61A41D306735F7|/'), # TODO: Check if this value is valid + ('hash_md5', '1MB', u'5d8dcbca8d8ac21766f28797d6c3954c'), + ('hash_sha1', '1MB', u'51d2b8f3248d7ee495b7750c8da5aa3b3819de9d'), + ('hash_md5', 'dummy.srt', u'64de6b5893cac24456c46a935ef9c359'), + ('hash_sha1', 'dummy.srt', u'a703fc0fa4518080505809bf562c6fc6f7b3c98c') + ) + + for hash_type, filename, expected_value in hashes: + guess = guess_file_info(file_in_same_dir(__file__, filename), hash_type) + computed_value = guess.get(hash_type) + self.assertEqual(expected_value, guess.get(hash_type), "Invalid %s for %s: %s != %s" % (hash_type, filename, computed_value, expected_value)) + + +suite = allTests(TestHashes) + +if __name__ == '__main__': + TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_language.py b/libs/guessit/test/test_language.py new file mode 100644 index 00000000..99578fe7 --- /dev/null +++ b/libs/guessit/test/test_language.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. 
+# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . +# + +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.test.guessittest import * + +import io + + +class TestLanguage(TestGuessit): + + def check_languages(self, languages): + for lang1, lang2 in languages.items(): + self.assertEqual(Language.fromguessit(lang1), + Language.fromguessit(lang2)) + + def test_addic7ed(self): + languages = {'English': 'en', + 'English (US)': 'en-US', + 'English (UK)': 'en-UK', + 'Italian': 'it', + 'Portuguese': 'pt', + 'Portuguese (Brazilian)': 'pt-BR', + 'Romanian': 'ro', + 'Español (Latinoamérica)': 'es-MX', + 'Español (España)': 'es-ES', + 'Spanish (Latin America)': 'es-MX', + 'Español': 'es', + 'Spanish': 'es', + 'Spanish (Spain)': 'es-ES', + 'French': 'fr', + 'Greek': 'el', + 'Arabic': 'ar', + 'German': 'de', + 'Croatian': 'hr', + 'Indonesian': 'id', + 'Hebrew': 'he', + 'Russian': 'ru', + 'Turkish': 'tr', + 'Swedish': 'se', + 'Czech': 'cs', + 'Dutch': 'nl', + 'Hungarian': 'hu', + 'Norwegian': 'no', + 'Polish': 'pl', + 'Persian': 'fa'} + + self.check_languages(languages) + + def test_subswiki(self): + languages = {'English (US)': 'en-US', 'English (UK)': 'en-UK', 'English': 'en', + 'French': 'fr', 'Brazilian': 'po', 'Portuguese': 'pt', + 'Español (Latinoamérica)': 'es-MX', 'Español (España)': 'es-ES', + 'Español': 'es', 'Italian': 'it', 'Català': 'ca'} + + self.check_languages(languages) + + def test_tvsubtitles(self): + languages = {'English': 'en', 'Español': 'es', 'French': 'fr', 'German': 'de', + 'Brazilian': 'br', 'Russian': 'ru', 'Ukrainian': 'ua', 'Italian': 'it', + 'Greek': 'gr', 'Arabic': 'ar', 'Hungarian': 'hu', 'Polish': 'pl', + 'Turkish': 'tr', 'Dutch': 'nl', 'Portuguese': 'pt', 'Swedish': 'sv', + 'Danish': 'da', 'Finnish': 'fi', 'Korean': 'ko', 'Chinese': 'cn', + 'Japanese': 'jp', 'Bulgarian': 'bg', 'Czech': 'cz', 'Romanian': 'ro'} + + 
self.check_languages(languages) + + def test_opensubtitles(self): + opensubtitles_langfile = file_in_same_dir(__file__, 'opensubtitles_languages_2012_05_09.txt') + for l in [u(l).strip() for l in io.open(opensubtitles_langfile, encoding='utf-8')][1:]: + idlang, alpha2, _, upload_enabled, web_enabled = l.strip().split('\t') + # do not test languages that are too esoteric / not widely available + if int(upload_enabled) and int(web_enabled): + # check that we recognize the opensubtitles language code correctly + # and that we are able to output this code from a language + self.assertEqual(idlang, Language.fromguessit(idlang).opensubtitles) + if alpha2: + # check we recognize the opensubtitles 2-letter code correctly + self.check_languages({idlang: alpha2}) + + def test_tmdb(self): + # examples from http://api.themoviedb.org/2.1/language-tags + for lang in ['en-US', 'en-CA', 'es-MX', 'fr-PF']: + self.assertEqual(lang, str(Language.fromguessit(lang))) + + def test_subtitulos(self): + languages = {'English (US)': 'en-US', 'English (UK)': 'en-UK', 'English': 'en', + 'French': 'fr', 'Brazilian': 'po', 'Portuguese': 'pt', + 'Español (Latinoamérica)': 'es-MX', 'Español (España)': 'es-ES', + 'Español': 'es', 'Italian': 'it', 'Català': 'ca'} + + self.check_languages(languages) + + def test_thesubdb(self): + languages = {'af': 'af', 'cs': 'cs', 'da': 'da', 'de': 'de', 'en': 'en', 'es': 'es', 'fi': 'fi', + 'fr': 'fr', 'hu': 'hu', 'id': 'id', 'it': 'it', 'la': 'la', 'nl': 'nl', 'no': 'no', + 'oc': 'oc', 'pl': 'pl', 'pt': 'pt', 'ro': 'ro', 'ru': 'ru', 'sl': 'sl', 'sr': 'sr', + 'sv': 'sv', 'tr': 'tr'} + + self.check_languages(languages) + + def test_exceptions(self): + self.assertEqual(Language.fromguessit('br'), Language.fromguessit('pt(br)')) + + self.assertEqual(Language.fromguessit('unknown'), + Language.fromguessit('und')) + + +suite = allTests(TestLanguage) + +if __name__ == '__main__': + TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_main.py 
b/libs/guessit/test/test_main.py new file mode 100644 index 00000000..1140654a --- /dev/null +++ b/libs/guessit/test/test_main.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2014 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . +# + +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.test.guessittest import * +from guessit.fileutils import split_path, file_in_same_dir +from guessit.textutils import strip_brackets, str_replace, str_fill +from guessit import PY2 +from guessit import __main__ + +if PY2: + from StringIO import StringIO +else: + from io import StringIO + + +class TestMain(TestGuessit): + def setUp(self): + self._stdout = sys.stdout + string_out = StringIO() + sys.stdout = string_out + + def tearDown(self): + sys.stdout = self._stdout + + def test_list_properties(self): + __main__.main(["-p"], False) + __main__.main(["-V"], False) + + def test_list_transformers(self): + __main__.main(["--transformers"], False) + __main__.main(["-V", "--transformers"], False) + + def test_demo(self): + __main__.main(["-d"], False) + + def test_filename(self): + __main__.main(["A.Movie.2014.avi"], False) + __main__.main(["A.Movie.2014.avi", "A.2nd.Movie.2014.avi"], False) + __main__.main(["-y", "A.Movie.2014.avi"], False) + 
__main__.main(["-a", "A.Movie.2014.avi"], False) + __main__.main(["-v", "A.Movie.2014.avi"], False) + __main__.main(["-t", "movie", "A.Movie.2014.avi"], False) + __main__.main(["-t", "episode", "A.Serie.S02E06.avi"], False) + __main__.main(["-i", "hash_mpc", file_in_same_dir(__file__, "1MB")], False) + __main__.main(["-i", "hash_md5", file_in_same_dir(__file__, "1MB")], False) + +suite = allTests(TestMain) + +if __name__ == '__main__': + TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_matchtree.py b/libs/guessit/test/test_matchtree.py new file mode 100644 index 00000000..8712d78f --- /dev/null +++ b/libs/guessit/test/test_matchtree.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . +# + +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.test.guessittest import * + +from guessit.transfo.guess_release_group import GuessReleaseGroup +from guessit.transfo.guess_properties import GuessProperties +from guessit.matchtree import BaseMatchTree + +keywords = yaml.load(""" + +? Xvid PROPER +: videoCodec: Xvid + other: PROPER + +? 
PROPER-Xvid +: videoCodec: Xvid + other: PROPER + +""") + + +def guess_info(string, options=None): + mtree = MatchTree(string) + GuessReleaseGroup().process(mtree, options) + GuessProperties().process(mtree, options) + return mtree.matched() + + +class TestMatchTree(TestGuessit): + def test_base_tree(self): + t = BaseMatchTree('One Two Three(Three) Four') + t.partition((3, 7, 20)) + leaves = list(t.leaves()) + + self.assertEqual(leaves[0].span, (0, 3)) + + self.assertEqual('One', leaves[0].value) + self.assertEqual(' Two', leaves[1].value) + self.assertEqual(' Three(Three)', leaves[2].value) + self.assertEqual(' Four', leaves[3].value) + + leaves[2].partition((1, 6, 7, 12)) + three_leaves = list(leaves[2].leaves()) + + self.assertEqual('Three', three_leaves[1].value) + self.assertEqual('Three', three_leaves[3].value) + + leaves = list(t.leaves()) + + self.assertEqual(len(leaves), 8) + + self.assertEqual(leaves[5], three_leaves[3]) + + self.assertEqual(t.previous_leaf(leaves[5]), leaves[4]) + self.assertEqual(t.next_leaf(leaves[5]), leaves[6]) + + self.assertEqual(t.next_leaves(leaves[5]), [leaves[6], leaves[7]]) + self.assertEqual(t.previous_leaves(leaves[5]), [leaves[4], leaves[3], leaves[2], leaves[1], leaves[0]]) + + self.assertEqual(t.next_leaf(leaves[7]), None) + self.assertEqual(t.previous_leaf(leaves[0]), None) + + self.assertEqual(t.next_leaves(leaves[7]), []) + self.assertEqual(t.previous_leaves(leaves[0]), []) + + def test_match(self): + self.checkFields(keywords, guess_info) + + +suite = allTests(TestMatchTree) + +if __name__ == '__main__': + TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_movie.py b/libs/guessit/test/test_movie.py new file mode 100644 index 00000000..eecbf49d --- /dev/null +++ b/libs/guessit/test/test_movie.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you 
can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . +# + +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.test.guessittest import * + + +class TestMovie(TestGuessit): + def testMovies(self): + self.checkMinimumFieldsCorrect(filetype='movie', + filename='movies.yaml') + + +suite = allTests(TestMovie) + +if __name__ == '__main__': + TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_quality.py b/libs/guessit/test/test_quality.py new file mode 100644 index 00000000..52e21791 --- /dev/null +++ b/libs/guessit/test/test_quality.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . 
+# + +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.quality import best_quality, best_quality_properties +from guessit.containers import QualitiesContainer +from guessit.test.guessittest import * + + +class TestQuality(TestGuessit): + def test_container(self): + container = QualitiesContainer() + + container.register_quality('color', 'red', 10) + container.register_quality('color', 'orange', 20) + container.register_quality('color', 'green', 30) + + container.register_quality('context', 'sun', 100) + container.register_quality('context', 'sea', 200) + container.register_quality('context', 'sex', 300) + + g1 = Guess() + g1['color'] = 'red' + + g2 = Guess() + g2['color'] = 'green' + + g3 = Guess() + g3['color'] = 'orange' + + q3 = container.rate_quality(g3) + self.assertEqual(q3, 20, "ORANGE should be rated 20. Don't ask why!") + + q1 = container.rate_quality(g1) + q2 = container.rate_quality(g2) + + self.assertTrue(q2 > q1, "GREEN should be greater than RED. Don't ask why!") + + g1['context'] = 'sex' + g2['context'] = 'sun' + + q1 = container.rate_quality(g1) + q2 = container.rate_quality(g2) + + self.assertTrue(q1 > q2, "SEX should be greater than SUN. Don't ask why!") + + self.assertEqual(container.best_quality(g1, g2), g1, "RED&SEX should be better than GREEN&SUN. Don't ask why!") + + self.assertEqual(container.best_quality_properties(['color'], g1, g2), g2, "GREEN should be better than RED. Don't ask why!") + + self.assertEqual(container.best_quality_properties(['context'], g1, g2), g1, "SEX should be better than SUN. Don't ask why!") + + q1 = container.rate_quality(g1, 'color') + q2 = container.rate_quality(g2, 'color') + + self.assertTrue(q2 > q1, "GREEN should be greater than RED. 
Don't ask why!") + + container.unregister_quality('context', 'sex') + container.unregister_quality('context', 'sun') + + q1 = container.rate_quality(g1) + q2 = container.rate_quality(g2) + + self.assertTrue(q2 > q1, "GREEN&SUN should be greater than RED&SEX. Don't ask why!") + + g3['context'] = 'sea' + container.unregister_quality('context', 'sea') + + q3 = container.rate_quality(g3, 'context') + self.assertEqual(q3, 0, "Context should be unregistered.") + + container.unregister_quality('color') + q3 = container.rate_quality(g3, 'color') + + self.assertEqual(q3, 0, "Color should be unregistered.") + + container.clear_qualities() + + q1 = container.rate_quality(g1) + q2 = container.rate_quality(g2) + + self.assertTrue(q1 == q2 == 0, "Empty quality container should rate each guess to 0") + + def test_quality_transformers(self): + guess_720p = guessit.guess_file_info("2012.2009.720p.BluRay.x264.DTS WiKi.mkv") + guess_1080p = guessit.guess_file_info("2012.2009.1080p.BluRay.x264.MP3 WiKi.mkv") + + self.assertTrue('audioCodec' in guess_720p, "audioCodec should be present") + self.assertTrue('audioCodec' in guess_1080p, "audioCodec should be present") + self.assertTrue('screenSize' in guess_720p, "screenSize should be present") + self.assertTrue('screenSize' in guess_1080p, "screenSize should be present") + + best_quality_guess = best_quality(guess_720p, guess_1080p) + + self.assertTrue(guess_1080p == best_quality_guess, "1080p+MP3 is not the best global quality") + + best_quality_guess = best_quality_properties(['screenSize'], guess_720p, guess_1080p) + + self.assertTrue(guess_1080p == best_quality_guess, "1080p is not the best screenSize") + + best_quality_guess = best_quality_properties(['audioCodec'], guess_720p, guess_1080p) + + self.assertTrue(guess_720p == best_quality_guess, "DTS is not the best audioCodec") + +suite = allTests(TestQuality) + +if __name__ == '__main__': + TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_utils.py 
class TestUtils(TestGuessit):
    """Unit tests for the path, text and date helpers used by GuessIt."""

    def test_splitpath(self):
        """split_path() must split a path into its components on the current platform."""
        alltests = {False: {'/usr/bin/smewt': ['/', 'usr', 'bin', 'smewt'],
                            'relative_path/to/my_folder/': ['relative_path', 'to', 'my_folder'],
                            '//some/path': ['//', 'some', 'path'],
                            '//some//path': ['//', 'some', 'path'],
                            '///some////path': ['///', 'some', 'path']

                            },
                    True: {'C:\\Program Files\\Smewt\\smewt.exe': ['C:\\', 'Program Files', 'Smewt', 'smewt.exe'],
                           'Documents and Settings\\User\\config': ['Documents and Settings', 'User', 'config'],
                           'C:\\Documents and Settings\\User\\config': ['C:\\', 'Documents and Settings', 'User', 'config'],
                           # http://bugs.python.org/issue19945
                           '\\\\netdrive\\share': ['\\\\', 'netdrive', 'share'] if PY2 else ['\\\\netdrive\\share'],
                           '\\\\netdrive\\share\\folder': ['\\\\', 'netdrive', 'share', 'folder'] if PY2 else ['\\\\netdrive\\share\\', 'folder'],
                           }
                    }
        # The dict is keyed by "are we running on Windows?"
        tests = alltests[sys.platform == 'win32']
        for path, split in tests.items():
            self.assertEqual(split, split_path(path))

    def test_strip_brackets(self):
        """strip_brackets() only removes a *matching* pair of enclosing brackets."""
        # Named `cases` (not `allTests`) so the imported allTests() helper is not shadowed.
        cases = (('', ''),
                 ('[test]', 'test'),
                 ('{test2}', 'test2'),
                 ('(test3)', 'test3'),
                 ('(test4]', '(test4]'),
                 )

        for i, e in cases:
            self.assertEqual(e, strip_brackets(i))

    def test_levenshtein(self):
        """levenshtein() returns the edit distance between two strings."""
        self.assertEqual(levenshtein("abcdef ghijk lmno", "abcdef ghijk lmno"), 0)
        self.assertEqual(levenshtein("abcdef ghijk lmnop", "abcdef ghijk lmno"), 1)
        self.assertEqual(levenshtein("abcdef ghijk lmno", "abcdef ghijk lmn"), 1)
        self.assertEqual(levenshtein("abcdef ghijk lmno", "abcdef ghijk lmnp"), 1)
        self.assertEqual(levenshtein("abcdef ghijk lmno", "abcdef ghijk lmnq"), 1)
        self.assertEqual(levenshtein("cbcdef ghijk lmno", "abcdef ghijk lmnq"), 2)
        self.assertEqual(levenshtein("cbcdef ghihk lmno", "abcdef ghijk lmnq"), 3)

    def test_reorder_title(self):
        """reorder_title() moves a trailing article back in front of the title."""
        self.assertEqual(reorder_title("Simpsons, The"), "The Simpsons")
        self.assertEqual(reorder_title("Simpsons,The"), "The Simpsons")
        self.assertEqual(reorder_title("Simpsons,Les", articles=('the', 'le', 'la', 'les')), "Les Simpsons")
        self.assertEqual(reorder_title("Simpsons, Les", articles=('the', 'le', 'la', 'les')), "Les Simpsons")

    def test_camel(self):
        """from_camel()/is_camel() plus the small str_replace()/str_fill() helpers."""
        self.assertEqual("", from_camel(""))

        self.assertEqual("Hello world", str_replace("Hello World", 6, 'w'))
        self.assertEqual("Hello *****", str_fill("Hello World", (6, 11), '*'))

        # Was assertTrue(expected, actual): that treats `actual` as the failure
        # message and always passes on a non-empty string, so nothing was checked.
        self.assertEqual("This is camel", from_camel("ThisIsCamel"))

        self.assertEqual('camel case', from_camel('camelCase'))
        self.assertEqual('A case', from_camel('ACase'))
        self.assertEqual('MiXedCaSe is not camel case', from_camel('MiXedCaSe is not camelCase'))

        self.assertEqual("This is camel cased title", from_camel("ThisIsCamelCasedTitle"))
        self.assertEqual("This is camel CASED title", from_camel("ThisIsCamelCASEDTitle"))

        self.assertEqual("These are camel CASED title", from_camel("TheseAreCamelCASEDTitle"))

        self.assertEqual("Give a camel case string", from_camel("GiveACamelCaseString"))

        self.assertEqual("Death TO camel case", from_camel("DeathTOCamelCase"))
        self.assertEqual("But i like java too:)", from_camel("ButILikeJavaToo:)"))

        self.assertEqual("Beatdown french DVD rip.mkv", from_camel("BeatdownFrenchDVDRip.mkv"))
        self.assertEqual("DO NOTHING ON UPPER CASE", from_camel("DO NOTHING ON UPPER CASE"))

        self.assertFalse(is_camel("this_is_not_camel"))
        self.assertTrue(is_camel("ThisIsCamel"))

        self.assertEqual("Dark.City.(1998).DC.BDRIP.720p.DTS.X264-CHD.mkv", from_camel("Dark.City.(1998).DC.BDRIP.720p.DTS.X264-CHD.mkv"))
        self.assertFalse(is_camel("Dark.City.(1998).DC.BDRIP.720p.DTS.X264-CHD.mkv"))

        self.assertEqual("A2LiNE", from_camel("A2LiNE"))

    def test_date(self):
        """search_year()/search_date() must return (value, span) or (None, None)."""
        self.assertEqual(search_year(' in the year 2000... '), (2000, (13, 17)))
        self.assertEqual(search_year(' they arrived in 1492. '), (None, None))

        today = date.today()
        today_year_2 = int(str(today.year)[2:])

        future = today + timedelta(days=1000)
        future_year_2 = int(str(future.year)[2:])

        past = today - timedelta(days=10000)
        past_year_2 = int(str(past.year)[2:])

        self.assertEqual(search_date(' Something before 2002-04-22 '), (date(2002, 4, 22), (18, 28)))
        self.assertEqual(search_date(' 2002-04-22 Something after '), (date(2002, 4, 22), (1, 11)))

        self.assertEqual(search_date(' This happened on 2002-04-22. '), (date(2002, 4, 22), (18, 28)))
        self.assertEqual(search_date(' This happened on 22-04-2002. '), (date(2002, 4, 22), (18, 28)))

        # Two-digit years are zero-padded (%02d): with %s a year such as 2005
        # would render as "5", producing a 7-character date that cannot match
        # the 8-character span (18, 26) asserted below.
        self.assertEqual(search_date(' This happened on 13-04-%02d. ' % (today_year_2,)), (date(today.year, 4, 13), (18, 26)))
        self.assertEqual(search_date(' This happened on 22-04-%02d. ' % (future_year_2,)), (date(future.year, 4, 22), (18, 26)))
        self.assertEqual(search_date(' This happened on 20-04-%02d. ' % (past_year_2,)), (date(past.year, 4, 20), (18, 26)))

        self.assertEqual(search_date(' This happened on 13-06-14. ', year_first=True), (date(2013, 6, 14), (18, 26)))
        self.assertEqual(search_date(' This happened on 13-05-14. ', year_first=False), (date(2014, 5, 13), (18, 26)))

        self.assertEqual(search_date(' This happened on 04-13-%02d. ' % (today_year_2,)), (date(today.year, 4, 13), (18, 26)))
        self.assertEqual(search_date(' This happened on 04-22-%02d. ' % (future_year_2,)), (date(future.year, 4, 22), (18, 26)))
        self.assertEqual(search_date(' This happened on 04-20-%02d. ' % (past_year_2,)), (date(past.year, 4, 20), (18, 26)))

        # Impossible day/month combinations must not be reported as dates.
        self.assertEqual(search_date(' This happened on 35-12-%02d. ' % (today_year_2,)), (None, None))
        self.assertEqual(search_date(' This happened on 37-18-%02d. ' % (future_year_2,)), (None, None))
        self.assertEqual(search_date(' This happened on 44-42-%02d. ' % (past_year_2,)), (None, None))

        # str(date) is the ISO form YYYY-MM-DD, hence the 10-character span.
        self.assertEqual(search_date(' This happened on %s. ' % (today, )), (today, (18, 28)))
        self.assertEqual(search_date(' This happened on %s. ' % (future, )), (future, (18, 28)))
        self.assertEqual(search_date(' This happened on %s. ' % (past, )), (past, (18, 28)))

        self.assertEqual(search_date(' released date: 04-03-1901? '), (None, None))

        self.assertEqual(search_date(' There\'s no date in here. '), (None, None))

        # Ambiguous short dates: interpretation is driven by year_first/day_first.
        self.assertEqual(search_date(' Something 01-02-03 '), (date(2003, 2, 1), (11, 19)))
        self.assertEqual(search_date(' Something 01-02-03 ', year_first=False, day_first=True), (date(2003, 2, 1), (11, 19)))
        self.assertEqual(search_date(' Something 01-02-03 ', year_first=True), (date(2001, 2, 3), (11, 19)))
        self.assertEqual(search_date(' Something 01-02-03 ', day_first=False), (date(2003, 1, 2), (11, 19)))
def clean_default(st):
    """Replace separator characters in ``st`` by spaces and tidy the result.

    Every character of ``sep`` except ``-`` and ``,`` is replaced by a space.
    Dots get special treatment: the first span matched by ``_dotted_rexp``
    (a dotted acronym such as ``A.B.C.``) keeps its dots, while dots before
    and after that span are still replaced.

    Runs of whitespace are then collapsed to single spaces and leading or
    trailing dashes are removed.

    :param st: the string to clean
    :return: the cleaned string
    """
    for c in sep:
        # do not remove certain chars
        if c in ('-', ','):
            continue

        if c == '.':
            # we should not remove the dots for acronyms and such
            dotted = _dotted_rexp.search(st)
            if dotted:
                exclude_begin, exclude_end = dotted.span(1)

                st = (st[:exclude_begin].replace(c, ' ') +
                      st[exclude_begin:exclude_end] +
                      st[exclude_end:].replace(c, ' '))
                continue

        st = st.replace(c, ' ')

    # split() without arguments never yields empty parts, so joining its
    # result is enough to collapse consecutive whitespace
    result = ' '.join(st.split())

    # now also remove dashes on the outer part of the string
    return result.strip('-')
def str_replace(string, pos, c):
    """Return a copy of ``string`` where the character at index ``pos`` is
    dropped and ``c`` is inserted in its place.

    :param string: the source string
    :param pos: index of the character to replace
    :param c: the replacement text
    """
    before = string[:pos]
    after = string[pos + 1:]
    return before + c + after
# Short words that are allowed to appear inside a camelCased string without
# the surrounding characters being considered a "MiXed" case word.
_camel_word2_set = set(('is', 'to',))
_camel_word3_set = set(('the',))


def _camel_split_and_lower(string, i):
    """Retrieves a tuple (need_split, need_lower)

    need_split is True if this char is a first letter in a camelCasedString.
    need_lower is True if this char should be lowercased.

    The decision only looks at the character at index ``i`` and at up to two
    characters on each side of it.
    """

    def islower(c):
        # unlike str.islower(), require an alphabetic char: digits and
        # punctuation are neither upper nor lower here
        return c.isalpha() and not c.isupper()

    length = len(string)
    previous_char2 = string[i - 2] if i > 1 else None
    previous_char = string[i - 1] if i > 0 else None
    char = string[i]
    next_char = string[i + 1] if i + 1 < length else None
    next_char2 = string[i + 2] if i + 2 < length else None

    char_upper = char.isupper()
    char_lower = islower(char)

    previous_char2_upper = previous_char2.isupper() if previous_char2 else False

    previous_char_lower = islower(previous_char) if previous_char else False
    previous_char_upper = previous_char.isupper() if previous_char else False

    next_char_upper = next_char.isupper() if next_char else False
    next_char_lower = islower(next_char) if next_char else False

    next_char2_upper = next_char2.isupper() if next_char2 else False

    # Case alternating on every letter (e.g. 'MiXedCaSe') is not camel case
    mixedcase_word = (previous_char_upper and char_lower and next_char_upper) or \
                     (previous_char_lower and char_upper and next_char_lower and next_char2_upper) or \
                     (previous_char2_upper and previous_char_lower and char_upper)
    if mixedcase_word:
        # ... unless the alternation comes from a known short word ('Is', 'To', 'The')
        word2 = (char + next_char).lower() if next_char else None
        word3 = (char + next_char + next_char2).lower() if next_char and next_char2 else None
        word2b = (previous_char2 + previous_char).lower() if previous_char2 and previous_char else None
        if word2 in _camel_word2_set or word2b in _camel_word2_set or word3 in _camel_word3_set:
            mixedcase_word = False

    # An all-uppercase run (e.g. 'DVD') is split from what precedes it but keeps its case
    uppercase_word = previous_char_upper and char_upper and next_char_upper or (char_upper and next_char_upper and next_char2_upper)

    # lower -> Upper transition: start of a new word
    need_split = char_upper and previous_char_lower and not mixedcase_word

    if not need_split:
        # UPPER -> Upper+lower transition: last capital of an uppercase run
        # starts a new word (e.g. the 'R' in 'DVDRip')
        need_split = char_upper and previous_char_upper and next_char_lower
        uppercase_word = previous_char_upper and not next_char_lower

    need_lower = not uppercase_word and not mixedcase_word and need_split

    return (need_split, need_lower)


def is_camel(string):
    """Return True if at least one position of ``string`` starts a camelCase word.

    >>> is_camel('dogEATDog')
    True
    >>> is_camel('DeathToCamelCase')
    True
    >>> is_camel('death_to_camel_case')
    False
    >>> is_camel('TheBest')
    True
    >>> is_camel('The Best')
    False
    """
    return any(_camel_split_and_lower(string, i)[0] for i in range(len(string)))


def from_camel(string):
    """Split a camelCased string into space separated words.

    A space is inserted before each character that starts a new word, and
    that character is lowercased unless it belongs to an uppercase run.
    Empty input is returned unchanged.

    >>> from_camel('dogEATDog') == 'dog EAT dog'
    True
    >>> from_camel('DeathToCamelCase') == 'Death to camel case'
    True
    >>> from_camel('TheBest') == 'The best'
    True
    >>> from_camel('MiXedCaSe is not camelCase') == 'MiXedCaSe is not camel case'
    True
    """
    if not string:
        return string

    pieces = []
    for i, char in enumerate(string):
        need_split, need_lower = _camel_split_and_lower(string, i)
        if need_split:
            pieces.append(' ')
        pieces.append(char.lower() if need_lower else char)
    return ''.join(pieces)
+GRAPHICS +GS +GT +GU +GURU +GW +GY +HK +HM +HN +HOLDINGS +HR +HT +HU +ID +IE +IL +IM +IN +INFO +INT +IO +IQ +IR +IS +IT +JE +JM +JO +JOBS +JP +KE +KG +KH +KI +KITCHEN +KM +KN +KP +KR +KW +KY +KZ +LA +LAND +LB +LC +LI +LIGHTING +LK +LR +LS +LT +LU +LV +LY +MA +MC +MD +ME +MG +MH +MIL +MK +ML +MM +MN +MO +MOBI +MP +MQ +MR +MS +MT +MU +MUSEUM +MV +MW +MX +MY +MZ +NA +NAME +NC +NE +NET +NF +NG +NI +NL +NO +NP +NR +NU +NZ +OM +ORG +PA +PE +PF +PG +PH +PHOTOGRAPHY +PK +PL +PLUMBING +PM +PN +POST +PR +PRO +PS +PT +PW +PY +QA +RE +RO +RS +RU +RW +SA +SB +SC +SD +SE +SEXY +SG +SH +SI +SINGLES +SJ +SK +SL +SM +SN +SO +SR +ST +SU +SV +SX +SY +SZ +TATTOO +TC +TD +TECHNOLOGY +TEL +TF +TG +TH +TIPS +TJ +TK +TL +TM +TN +TO +TODAY +TP +TR +TRAVEL +TT +TV +TW +TZ +UA +UG +UK +US +UY +UZ +VA +VC +VE +VENTURES +VG +VI +VN +VOYAGE +VU +WF +WS +XN--3E0B707E +XN--45BRJ9C +XN--80AO21A +XN--80ASEHDB +XN--80ASWG +XN--90A3AC +XN--CLCHC0EA0B2G2A9GCD +XN--FIQS8S +XN--FIQZ9S +XN--FPCRJ9C3D +XN--FZC2C9E2C +XN--GECRJ9C +XN--H2BRJ9C +XN--J1AMH +XN--J6W193G +XN--KPRW13D +XN--KPRY57D +XN--L1ACC +XN--LGBBAT1AD8J +XN--MGB9AWBF +XN--MGBA3A4F16A +XN--MGBAAM7A8H +XN--MGBAYH7GPA +XN--MGBBH1A71E +XN--MGBC0A9AZCG +XN--MGBERP4A5D4AR +XN--MGBX4CD0AB +XN--NGBC5AZD +XN--O3CW4H +XN--OGBPF8FL +XN--P1AI +XN--PGBS0DH +XN--Q9JYB4C +XN--S9BRJ9C +XN--UNUP4Y +XN--WGBH1C +XN--WGBL6A +XN--XKC2AL3HYE2A +XN--XKC2DL3A5EE0H +XN--YFRO4I67O +XN--YGBI2AMMX +XXX +YE +YT +ZA +ZM +ZW diff --git a/libs/guessit/transfo/__init__.py b/libs/guessit/transfo/__init__.py index a28aa988..cce2dfda 100644 --- a/libs/guessit/transfo/__init__.py +++ b/libs/guessit/transfo/__init__.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,92 +18,13 @@ # along with this 
class TransformerException(Exception):
    """Exception that remembers the transformer it was raised for.

    ``message`` becomes the regular exception message; ``transformer`` is
    stored unchanged on the instance as ``self.transformer``.
    """

    def __init__(self, transformer, message):
        # Let the base class store the message the usual way
        super(TransformerException, self).__init__(message)

        self.transformer = transformer
class ExpectedSeries(Transformer):
    """Transformer that looks for the series names supplied by the user
    through the ``expected_series`` option and reports them as ``series``.
    """

    def __init__(self):
        Transformer.__init__(self, 230)

    def register_arguments(self, opts, naming_opts, output_opts, information_opts, webservice_opts, other_options):
        """Declare the -S/--expected-series command line option."""
        naming_opts.add_argument('-S', '--expected-series', action='append', dest='expected_series',
                                 help='Expected series to parse (can be used multiple times)')

    def should_process(self, mtree, options=None):
        """Run only when at least one expected series was given."""
        if not options:
            return options
        return options.get('expected_series')

    def expected_series(self, string, node=None, options=None):
        """Search ``string`` for any of the expected series names.

        A name prefixed with ``re:`` is used as a regular expression (with
        its spaces turned into dashes) and registered with ``enhance=True``;
        any other name is escaped and registered with ``enhance=False``.
        """
        container = PropertiesContainer(enhance=True, canonical_from_pattern=False)

        for name in options.get('expected_series'):
            is_regexp = name.startswith('re:')
            if is_regexp:
                pattern = name[3:].replace(' ', '-')
            else:
                pattern = re.escape(name)
            container.register_property('series', pattern, enhance=is_regexp)

        matches = container.find_properties(string, node, options)
        return container.as_guess(matches, string)

    def supported_properties(self):
        return ['series']

    def process(self, mtree, options=None):
        """Apply the expected series matcher to every unidentified leaf."""
        finder = GuessFinder(self.expected_series, None, self.log, options)
        finder.process_nodes(mtree.unidentified_leaves())
-*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . +# + +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.containers import PropertiesContainer +from guessit.matcher import GuessFinder + +from guessit.plugins.transformers import Transformer + +import re + + +class ExpectedTitle(Transformer): + def __init__(self): + Transformer.__init__(self, 225) + + def register_arguments(self, opts, naming_opts, output_opts, information_opts, webservice_opts, other_options): + naming_opts.add_argument('-T', '--expected-title', action='append', dest='expected_title', + help='Expected title (can be used multiple times)') + + def should_process(self, mtree, options=None): + return options and options.get('expected_title') + + def expected_titles(self, string, node=None, options=None): + container = PropertiesContainer(enhance=True, canonical_from_pattern=False) + + for expected_title in options.get('expected_title'): + if expected_title.startswith('re:'): + expected_title = expected_title[3:] + expected_title = expected_title.replace(' ', '-') + container.register_property('title', expected_title, enhance=True) + else: + expected_title = re.escape(expected_title) + container.register_property('title', expected_title, enhance=False) + + found = 
class GuessBonusFeatures(Transformer):
    """Transformer that names the unidentified leaves surrounding an already
    guessed bonus number, film number or season.
    """

    def __init__(self):
        Transformer.__init__(self, -150)

    def supported_properties(self):
        return ['bonusNumber', 'bonusTitle', 'filmNumber', 'filmSeries', 'title', 'series']

    def process(self, mtree, options=None):
        """Tag leaves next to bonus/film/season leaves.

        - the unidentified leaf following a ``bonusNumber`` (same group)
          becomes ``bonusTitle``;
        - the unidentified leaves before/after a ``filmNumber`` become
          ``filmSeries`` and ``title``;
        - when the tree has a ``bonusNumber``, the unidentified leaf preceding
          a ``season`` (same group) becomes ``series``.

        Nothing is tagged when the expected neighbouring leaf does not exist.
        """
        def previous_group(g):
            # closest unidentified leaf located before g, or None
            for leaf in reversed(list(mtree.unidentified_leaves())):
                if leaf.node_idx < g.node_idx:
                    return leaf
            return None

        def next_group(g):
            # closest unidentified leaf located after g, or None
            for leaf in mtree.unidentified_leaves():
                if leaf.node_idx > g.node_idx:
                    return leaf
            return None

        def same_group(g1, g2):
            return g1.node_idx[:2] == g2.node_idx[:2]

        bonus = [node for node in mtree.leaves() if 'bonusNumber' in node.guess]
        if bonus:
            bonus_title = next_group(bonus[0])
            if bonus_title and same_group(bonus_title, bonus[0]):
                found_property(bonus_title, 'bonusTitle', confidence=0.8)

        film_number = [node for node in mtree.leaves()
                       if 'filmNumber' in node.guess]
        if film_number:
            # previous_group()/next_group() return None when there is no
            # neighbouring unidentified leaf: guard like for bonus_title
            film_series = previous_group(film_number[0])
            if film_series:
                found_property(film_series, 'filmSeries', confidence=0.9)

            title = next_group(film_number[0])
            if title:
                found_property(title, 'title', confidence=0.9)

        season = [node for node in mtree.leaves() if 'season' in node.guess]
        if season and 'bonusNumber' in mtree.info:
            series = previous_group(season[0])
            # same_group() reads series.node_idx, so series must exist
            if series and same_group(series, season[0]):
                found_property(series, 'series', confidence=0.9)
class GuessCountry(Transformer):
    """Transformer that guesses the ``country`` property."""

    def __init__(self):
        Transformer.__init__(self, -170)
        # language values that process() re-interprets as a country
        self.replace_language = frozenset(['uk'])

    def register_arguments(self, opts, naming_opts, output_opts, information_opts, webservice_opts, other_options):
        """Declare the -C/--allowed-country command line option."""
        naming_opts.add_argument('-C', '--allowed-country', action='append', dest='allowed_countries',
                                 help='Allowed country (can be used multiple times)')

    def supported_properties(self):
        return ['country']

    def should_process(self, mtree, options=None):
        """Enabled unless the ``country`` option is set to a false value."""
        options = options or {}
        return options.get('country', True)

    def _scan_country(self, country, strict=False):
        """
        Find a country if it is at the start or end of country string

        Words are accumulated from the start of the string, then from its
        end, until the accumulated text is recognised as a country. As a last
        resort the whole string is tried, which raises ``babelfish.Error``
        when it is not a country either.

        ``strict`` is currently unused.

        :return: tuple (Country, (start, end)) of the recognised text
        """
        words_match = list(iter_words(country.lower()))
        s = ""
        start = None

        for word_match in words_match:
            # `is None` and not `not start`: 0 is a valid offset and must not
            # be overwritten by the offset of the following word
            if start is None:
                start = word_match.start(0)
            s += word_match.group(0)
            try:
                return Country.fromguessit(s), (start, word_match.end(0))
            except babelfish.Error:
                continue

        words_match.reverse()
        s = ""
        end = None
        for word_match in words_match:
            if end is None:
                end = word_match.end(0)
            s = word_match.group(0) + s
            try:
                return Country.fromguessit(s), (word_match.start(0), end)
            except babelfish.Error:
                continue

        return Country.fromguessit(country), (start, end)

    def is_valid_country(self, country, options=None):
        """Check ``country`` against the ``allowed_countries`` option when it
        is set, otherwise reject countries whose name or alpha2 code is in
        ``LNG_COMMON_WORDS``.
        """
        if options and options.get('allowed_countries'):
            allowed_countries = options.get('allowed_countries')
            return country.name.lower() in allowed_countries or country.alpha2.lower() in allowed_countries
        else:
            return (country.name.lower() not in LNG_COMMON_WORDS and
                    country.alpha2.lower() not in LNG_COMMON_WORDS)

    def guess_country(self, string, node=None, options=None):
        """Matcher used by GuessFinder: return a country Guess found in
        ``string``, or ``(None, None)`` when there is none.
        """
        c = string.strip().lower()
        if c not in LNG_COMMON_WORDS:
            try:
                country, country_span = self._scan_country(c, True)
                if self.is_valid_country(country, options):
                    # NOTE(review): the +1 presumably compensates for a leading
                    # sentinel character in `string` - confirm against GuessFinder
                    guess = Guess(country=country, confidence=1.0, input=node.value, span=(country_span[0] + 1, country_span[1] + 1))
                    return guess
            except babelfish.Error:
                pass
        return None, None

    def process(self, mtree, options=None):
        """Guess countries in unidentified leaves, then turn the language
        leaves listed in ``self.replace_language`` into countries.
        """
        GuessFinder(self.guess_country, None, self.log, options).process_nodes(mtree.unidentified_leaves())
        for node in mtree.leaves_containing('language'):
            c = node.clean_value.lower()
            if c in self.replace_language:
                node.guess.set('language', None)
                try:
                    country = Country.fromguessit(c)
                    if self.is_valid_country(country, options):
                        guess = Guess(country=country, confidence=0.9, input=node.value, span=node.span)
                        found_guess(node, guess, logger=log)
                except babelfish.Error:
                    pass

    def post_process(self, mtree, options=None, *args, **kwargs):
        # if country is in the guessed properties, make it part of the series name
        series_leaves = list(mtree.leaves_containing('series'))
        country_leaves = list(mtree.leaves_containing('country'))

        if series_leaves and country_leaves:
            # only the first country found is appended, to every series leaf
            country_leaf = country_leaves[0]
            for serie_leaf in series_leaves:
                serie_leaf.guess['series'] += ' (%s)' % str(country_leaf.guess['country'].guessit)
found, consider the second digits as the day.') -def process(mtree): - SingleNodeGuesser(guess_date, 1.0, log).process(mtree) + def supported_properties(self): + return ['date'] + + def guess_date(self, string, node=None, options=None): + date, span = search_date(string, options.get('date_year_first') if options else False, options.get('date_day_first') if options else False) + if date: + return {'date': date}, span + else: + return None, None + + def process(self, mtree, options=None): + GuessFinder(self.guess_date, 1.0, self.log, options).process_nodes(mtree.unidentified_leaves()) diff --git a/libs/guessit/transfo/guess_episode_details.py b/libs/guessit/transfo/guess_episode_details.py new file mode 100644 index 00000000..ba7ff298 --- /dev/null +++ b/libs/guessit/transfo/guess_episode_details.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . 
+# + +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.plugins.transformers import Transformer +from guessit.matcher import found_guess +from guessit.containers import PropertiesContainer +import itertools + + +class GuessEpisodeDetails(Transformer): + def __init__(self): + Transformer.__init__(self, -205) + self.container = PropertiesContainer() + self.container.register_property('episodeDetails', 'Special', 'Bonus', 'Omake', 'Ova', 'Oav', 'Pilot', 'Unaired') + self.container.register_property('episodeDetails', 'Extras?', canonical_form='Extras') + + def guess_details(self, string, node=None, options=None): + properties = self.container.find_properties(string, node, options, 'episodeDetails', multiple=True) + guesses = self.container.as_guess(properties, multiple=True) + return guesses + + def second_pass_options(self, mtree, options=None): + if not mtree.guess.get('type', '').startswith('episode'): + for unidentified_leaf in mtree.unidentified_leaves(): + properties = self.container.find_properties(unidentified_leaf.value, unidentified_leaf, options, 'episodeDetails') + guess = self.container.as_guess(properties) + if guess: + return {'type': 'episode'} + return None + + def supported_properties(self): + return self.container.get_supported_properties() + + def process(self, mtree, options=None): + if (mtree.guess.get('type', '').startswith('episode') and + (not mtree.info.get('episodeNumber') or + mtree.info.get('season') == 0)): + + for leaf in itertools.chain(mtree.leaves_containing('title'), + mtree.unidentified_leaves()): + guesses = self.guess_details(leaf.value, leaf, options) + for guess in guesses: + found_guess(leaf, guess, update_guess=False) + + return None diff --git a/libs/guessit/transfo/guess_episode_info_from_position.py b/libs/guessit/transfo/guess_episode_info_from_position.py index 967c3341..ad8973dd 100644 --- a/libs/guessit/transfo/guess_episode_info_from_position.py +++ 
b/libs/guessit/transfo/guess_episode_info_from_position.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,129 +18,164 @@ # along with this program. If not, see . # -from __future__ import unicode_literals -from guessit.transfo import found_property -from guessit.patterns import non_episode_title, unlikely_series -import logging +from __future__ import absolute_import, division, print_function, unicode_literals -log = logging.getLogger(__name__) +from guessit.plugins.transformers import Transformer, get_transformer +from guessit.textutils import reorder_title + +from guessit.matcher import found_property -def match_from_epnum_position(mtree, node): - epnum_idx = node.node_idx +class GuessEpisodeInfoFromPosition(Transformer): + def __init__(self): + Transformer.__init__(self, -200) - # a few helper functions to be able to filter using high-level semantics - def before_epnum_in_same_pathgroup(): - return [ leaf for leaf in mtree.unidentified_leaves() - if (leaf.node_idx[0] == epnum_idx[0] and - leaf.node_idx[1:] < epnum_idx[1:]) ] + def supported_properties(self): + return ['title', 'series'] - def after_epnum_in_same_pathgroup(): - return [ leaf for leaf in mtree.unidentified_leaves() - if (leaf.node_idx[0] == epnum_idx[0] and - leaf.node_idx[1:] > epnum_idx[1:]) ] + def match_from_epnum_position(self, mtree, node, options): + epnum_idx = node.node_idx - def after_epnum_in_same_explicitgroup(): - return [ leaf for leaf in mtree.unidentified_leaves() - if (leaf.node_idx[:2] == epnum_idx[:2] and - leaf.node_idx[2:] > epnum_idx[2:]) ] + # a few helper functions to be able to filter using high-level semantics + def before_epnum_in_same_pathgroup(): + return [leaf for leaf in 
mtree.unidentified_leaves(lambda x: len(x.clean_value) > 1) + if (leaf.node_idx[0] == epnum_idx[0] and + leaf.node_idx[1:] < epnum_idx[1:])] - # epnumber is the first group and there are only 2 after it in same - # path group - # -> series title - episode title - title_candidates = [ n for n in after_epnum_in_same_pathgroup() - if n.clean_value.lower() not in non_episode_title ] - if ('title' not in mtree.info and # no title - before_epnum_in_same_pathgroup() == [] and # no groups before - len(title_candidates) == 2): # only 2 groups after + def after_epnum_in_same_pathgroup(): + return [leaf for leaf in mtree.unidentified_leaves(lambda x: len(x.clean_value) > 1) + if (leaf.node_idx[0] == epnum_idx[0] and + leaf.node_idx[1:] > epnum_idx[1:])] - found_property(title_candidates[0], 'series', confidence=0.4) - found_property(title_candidates[1], 'title', confidence=0.4) - return + def after_epnum_in_same_explicitgroup(): + return [leaf for leaf in mtree.unidentified_leaves(lambda x: len(x.clean_value) > 1) + if (leaf.node_idx[:2] == epnum_idx[:2] and + leaf.node_idx[2:] > epnum_idx[2:])] - # if we have at least 1 valid group before the episodeNumber, then it's - # probably the series name - series_candidates = before_epnum_in_same_pathgroup() - if len(series_candidates) >= 1: - found_property(series_candidates[0], 'series', confidence=0.7) + # epnumber is the first group and there are only 2 after it in same + # path group + # -> series title - episode title + title_candidates = self._filter_candidates(after_epnum_in_same_pathgroup(), options) - # only 1 group after (in the same path group) and it's probably the - # episode title - title_candidates = [ n for n in after_epnum_in_same_pathgroup() - if n.clean_value.lower() not in non_episode_title ] + if ('title' not in mtree.info and # no title + 'series' in mtree.info and # series present + before_epnum_in_same_pathgroup() == [] and # no groups before + len(title_candidates) == 1): # only 1 group after - if 
len(title_candidates) == 1: - found_property(title_candidates[0], 'title', confidence=0.5) - return - else: - # try in the same explicit group, with lower confidence - title_candidates = [ n for n in after_epnum_in_same_explicitgroup() - if n.clean_value.lower() not in non_episode_title - ] - if len(title_candidates) == 1: found_property(title_candidates[0], 'title', confidence=0.4) return - elif len(title_candidates) > 1: - found_property(title_candidates[0], 'title', confidence=0.3) + + if ('title' not in mtree.info and # no title + before_epnum_in_same_pathgroup() == [] and # no groups before + len(title_candidates) == 2): # only 2 groups after + + found_property(title_candidates[0], 'series', confidence=0.4) + found_property(title_candidates[1], 'title', confidence=0.4) return - # get the one with the longest value - title_candidates = [ n for n in after_epnum_in_same_pathgroup() - if n.clean_value.lower() not in non_episode_title ] - if title_candidates: - maxidx = -1 - maxv = -1 - for i, c in enumerate(title_candidates): - if len(c.clean_value) > maxv: - maxidx = i - maxv = len(c.clean_value) - found_property(title_candidates[maxidx], 'title', confidence=0.3) + # if we have at least 1 valid group before the episodeNumber, then it's + # probably the series name + series_candidates = before_epnum_in_same_pathgroup() + if len(series_candidates) >= 1: + found_property(series_candidates[0], 'series', confidence=0.7) + # only 1 group after (in the same path group) and it's probably the + # episode title. 
+ title_candidates = self._filter_candidates(after_epnum_in_same_pathgroup(), options) + if len(title_candidates) == 1: + found_property(title_candidates[0], 'title', confidence=0.5) + return + else: + # try in the same explicit group, with lower confidence + title_candidates = self._filter_candidates(after_epnum_in_same_explicitgroup(), options) + if len(title_candidates) == 1: + found_property(title_candidates[0], 'title', confidence=0.4) + return + elif len(title_candidates) > 1: + found_property(title_candidates[0], 'title', confidence=0.3) + return -def process(mtree): - eps = [node for node in mtree.leaves() if 'episodeNumber' in node.guess] - if eps: - match_from_epnum_position(mtree, eps[0]) + # get the one with the longest value + title_candidates = self._filter_candidates(after_epnum_in_same_pathgroup(), options) + if title_candidates: + maxidx = -1 + maxv = -1 + for i, c in enumerate(title_candidates): + if len(c.clean_value) > maxv: + maxidx = i + maxv = len(c.clean_value) + found_property(title_candidates[maxidx], 'title', confidence=0.3) - else: - # if we don't have the episode number, but at least 2 groups in the - # basename, then it's probably series - eptitle - basename = mtree.node_at((-2,)) - title_candidates = [ n for n in basename.unidentified_leaves() - if n.clean_value.lower() not in non_episode_title - ] + def should_process(self, mtree, options=None): + options = options or {} + return not options.get('skip_title') and mtree.guess.get('type', '').startswith('episode') - if len(title_candidates) >= 2: - found_property(title_candidates[0], 'series', 0.4) - found_property(title_candidates[1], 'title', 0.4) - elif len(title_candidates) == 1: - # but if there's only one candidate, it's probably the series name - found_property(title_candidates[0], 'series', 0.4) + def _filter_candidates(self, candidates, options): + episode_details_transformer = get_transformer('guess_episode_details') + if episode_details_transformer: + return [n for n in 
candidates if not episode_details_transformer.container.find_properties(n.value, n, options, re_match=True)] + else: + return candidates - # if we only have 1 remaining valid group in the folder containing the - # file, then it's likely that it is the series name - try: - series_candidates = mtree.node_at((-3,)).unidentified_leaves() - except ValueError: - series_candidates = [] + def process(self, mtree, options=None): + """ + try to identify the remaining unknown groups by looking at their + position relative to other known elements + """ + eps = [node for node in mtree.leaves() if 'episodeNumber' in node.guess] - if len(series_candidates) == 1: - found_property(series_candidates[0], 'series', 0.3) + if not eps: + eps = [node for node in mtree.leaves() if 'date' in node.guess] - # if there's a path group that only contains the season info, then the - # previous one is most likely the series title (ie: ../series/season X/..) - eps = [ node for node in mtree.nodes() - if 'season' in node.guess and 'episodeNumber' not in node.guess ] + if eps: + self.match_from_epnum_position(mtree, eps[0], options) - if eps: - previous = [ node for node in mtree.unidentified_leaves() - if node.node_idx[0] == eps[0].node_idx[0] - 1 ] - if len(previous) == 1: - found_property(previous[0], 'series', 0.5) + else: + # if we don't have the episode number, but at least 2 groups in the + # basename, then it's probably series - eptitle + basename = mtree.node_at((-2,)) - # reduce the confidence of unlikely series - for node in mtree.nodes(): - if 'series' in node.guess: - if node.guess['series'].lower() in unlikely_series: - new_confidence = node.guess.confidence('series') * 0.5 - node.guess.set_confidence('series', new_confidence) + title_candidates = self._filter_candidates(basename.unidentified_leaves(), options) + + if len(title_candidates) >= 2 and 'series' not in mtree.info: + found_property(title_candidates[0], 'series', confidence=0.4) + found_property(title_candidates[1], 'title', 
confidence=0.4) + elif len(title_candidates) == 1: + # but if there's only one candidate, it's probably the series name + found_property(title_candidates[0], 'series' if 'series' not in mtree.info else 'title', confidence=0.4) + + # if we only have 1 remaining valid group in the folder containing the + # file, then it's likely that it is the series name + try: + series_candidates = list(mtree.node_at((-3,)).unidentified_leaves()) + except ValueError: + series_candidates = [] + + if len(series_candidates) == 1: + found_property(series_candidates[0], 'series', confidence=0.3) + + # if there's a path group that only contains the season info, then the + # previous one is most likely the series title (ie: ../series/season X/..) + eps = [node for node in mtree.nodes() + if 'season' in node.guess and 'episodeNumber' not in node.guess] + + if eps: + previous = [node for node in mtree.unidentified_leaves() + if node.node_idx[0] == eps[0].node_idx[0] - 1] + if len(previous) == 1: + found_property(previous[0], 'series', confidence=0.5) + + # If we have found title without any serie name, replace it by the serie name. 
+ if 'series' not in mtree.info and 'title' in mtree.info: + title_leaf = mtree.first_leaf_containing('title') + metadata = title_leaf.guess.metadata('title') + value = title_leaf.guess['title'] + del title_leaf.guess['title'] + title_leaf.guess.set('series', value, metadata=metadata) + + def post_process(self, mtree, options=None): + for node in mtree.nodes(): + if 'series' not in node.guess: + continue + + node.guess['series'] = reorder_title(node.guess['series']) diff --git a/libs/guessit/transfo/guess_episodes_rexps.py b/libs/guessit/transfo/guess_episodes_rexps.py index 30c2ca2f..927c9890 100644 --- a/libs/guessit/transfo/guess_episodes_rexps.py +++ b/libs/guessit/transfo/guess_episodes_rexps.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,49 +18,176 @@ # along with this program. If not, see . 
# -from __future__ import unicode_literals -from guessit import Guess -from guessit.transfo import SingleNodeGuesser -from guessit.patterns import episode_rexps +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.plugins.transformers import Transformer +from guessit.matcher import GuessFinder +from guessit.patterns import sep, build_or_pattern +from guessit.containers import PropertiesContainer, WeakValidator, NoValidator, ChainedValidator, DefaultValidator, \ + FormatterValidator +from guessit.patterns.numeral import numeral, digital_numeral, parse_numeral import re -import logging - -log = logging.getLogger(__name__) - -def number_list(s): - l = [ int(n) for n in re.sub('[^0-9]+', ' ', s).split() ] - - if len(l) == 2: - # it is an episode interval, return all numbers in between - return range(l[0], l[1]+1) - - return l - -def guess_episodes_rexps(string): - for rexp, confidence, span_adjust in episode_rexps: - match = re.search(rexp, string, re.IGNORECASE) - if match: - span = (match.start() + span_adjust[0], - match.end() + span_adjust[1]) - guess = Guess(match.groupdict(), confidence=confidence, raw=string[span[0]:span[1]]) - - # decide whether we have only a single episode number or an - # episode list - if guess.get('episodeNumber'): - eplist = number_list(guess['episodeNumber']) - guess.set('episodeNumber', eplist[0], confidence=confidence, raw=string[span[0]:span[1]]) - - if len(eplist) > 1: - guess.set('episodeList', eplist, confidence=confidence, raw=string[span[0]:span[1]]) - - if guess.get('bonusNumber'): - eplist = number_list(guess['bonusNumber']) - guess.set('bonusNumber', eplist[0], confidence=confidence, raw=string[span[0]:span[1]]) - - return guess, span - - return None, None -def process(mtree): - SingleNodeGuesser(guess_episodes_rexps, None, log).process(mtree) +class GuessEpisodesRexps(Transformer): + def __init__(self): + Transformer.__init__(self, 20) + + range_separators = ['-', 'to', 'a'] + 
discrete_separators = ['&', 'and', 'et'] + of_separators = ['of', 'sur', '/', '\\'] + + season_words = ['seasons?', 'saisons?', 'series?'] + episode_words = ['episodes?'] + + season_markers = ['s'] + episode_markers = ['e', 'ep'] + + discrete_sep = sep + for range_separator in range_separators: + discrete_sep = discrete_sep.replace(range_separator, '') + discrete_separators.append(discrete_sep) + all_separators = list(range_separators) + all_separators.extend(discrete_separators) + + self.container = PropertiesContainer(enhance=False, canonical_from_pattern=False) + + range_separators_re = re.compile(build_or_pattern(range_separators), re.IGNORECASE) + discrete_separators_re = re.compile(build_or_pattern(discrete_separators), re.IGNORECASE) + all_separators_re = re.compile(build_or_pattern(all_separators), re.IGNORECASE) + of_separators_re = re.compile(build_or_pattern(of_separators, escape=True), re.IGNORECASE) + + season_words_re = re.compile(build_or_pattern(season_words), re.IGNORECASE) + episode_words_re = re.compile(build_or_pattern(episode_words), re.IGNORECASE) + + season_markers_re = re.compile(build_or_pattern(season_markers), re.IGNORECASE) + episode_markers_re = re.compile(build_or_pattern(episode_markers), re.IGNORECASE) + + def list_parser(value, property_list_name, discrete_separators_re=discrete_separators_re, range_separators_re=range_separators_re, allow_discrete=False, fill_gaps=False): + discrete_elements = filter(lambda x: x != '', discrete_separators_re.split(value)) + discrete_elements = [x.strip() for x in discrete_elements] + + proper_discrete_elements = [] + i = 0 + while i < len(discrete_elements): + if i < len(discrete_elements) - 2 and range_separators_re.match(discrete_elements[i+1]): + proper_discrete_elements.append(discrete_elements[i] + discrete_elements[i+1] + discrete_elements[i+2]) + i += 3 + else: + match = range_separators_re.search(discrete_elements[i]) + if match and match.start() == 0: + proper_discrete_elements[i-1] = 
proper_discrete_elements[i-1] + discrete_elements[i] + elif match and match.end() == len(discrete_elements[i]): + proper_discrete_elements.append(discrete_elements[i] + discrete_elements[i + 1]) + else: + proper_discrete_elements.append(discrete_elements[i]) + i += 1 + + discrete_elements = proper_discrete_elements + + ret = [] + + for discrete_element in discrete_elements: + range_values = filter(lambda x: x != '', range_separators_re.split(discrete_element)) + range_values = [x.strip() for x in range_values] + if len(range_values) > 1: + for x in range(0, len(range_values) - 1): + start_range_ep = parse_numeral(range_values[x]) + end_range_ep = parse_numeral(range_values[x+1]) + for range_ep in range(start_range_ep, end_range_ep + 1): + if range_ep not in ret: + ret.append(range_ep) + else: + discrete_value = parse_numeral(discrete_element) + if discrete_value not in ret: + ret.append(discrete_value) + + if len(ret) > 1: + if not allow_discrete: + valid_ret = list() + # replace discrete elements by ranges + valid_ret.append(ret[0]) + for i in range(0, len(ret) - 1): + previous = valid_ret[len(valid_ret) - 1] + if ret[i+1] < previous: + pass + else: + valid_ret.append(ret[i+1]) + ret = valid_ret + if fill_gaps: + ret = list(range(min(ret), max(ret) + 1)) + if len(ret) > 1: + return {None: ret[0], property_list_name: ret} + if len(ret) > 0: + return ret[0] + return None + + def episode_parser_x(value): + return list_parser(value, 'episodeList', discrete_separators_re=re.compile('x', re.IGNORECASE)) + + def episode_parser_e(value): + return list_parser(value, 'episodeList', discrete_separators_re=re.compile('e', re.IGNORECASE), fill_gaps=True) + + def episode_parser(value): + return list_parser(value, 'episodeList') + + def season_parser(value): + return list_parser(value, 'seasonList') + + class ResolutionCollisionValidator(object): + def validate(self, prop, string, node, match, entry_start, entry_end): + return len(match.group(2)) < 3 # limit + + 
self.container.register_property(None, r'(' + season_words_re.pattern + sep + '?(?P' + numeral + ')' + sep + '?' + season_words_re.pattern + '?)', confidence=1.0, formatter=parse_numeral) + self.container.register_property(None, r'(' + season_words_re.pattern + sep + '?(?P' + digital_numeral + '(?:' + sep + '?' + all_separators_re.pattern + sep + '?' + digital_numeral + ')*)' + sep + '?' + season_words_re.pattern + '?)' + sep, confidence=1.0, formatter={None: parse_numeral, 'season': season_parser}, validator=ChainedValidator(DefaultValidator(), FormatterValidator('season', lambda x: len(x) > 1 if hasattr(x, '__len__') else False))) + + self.container.register_property(None, r'(' + season_markers_re.pattern + '(?P' + digital_numeral + ')[^0-9]?' + sep + '?(?P(?:e' + digital_numeral + '(?:' + sep + '?[e-]' + digital_numeral + ')*)))', confidence=1.0, formatter={None: parse_numeral, 'episodeNumber': episode_parser_e, 'season': season_parser}, validator=NoValidator()) + # self.container.register_property(None, r'[^0-9]((?P' + digital_numeral + ')[^0-9 .-]?-?(?P(?:x' + digital_numeral + '(?:' + sep + '?[x-]' + digital_numeral + ')*)))', confidence=1.0, formatter={None: parse_numeral, 'episodeNumber': episode_parser_x, 'season': season_parser}, validator=ChainedValidator(DefaultValidator(), ResolutionCollisionValidator())) + self.container.register_property(None, sep + r'((?P' + digital_numeral + ')' + sep + '' + '(?P(?:x' + sep + digital_numeral + '(?:' + sep + '[x-]' + digital_numeral + ')*)))', confidence=1.0, formatter={None: parse_numeral, 'episodeNumber': episode_parser_x, 'season': season_parser}, validator=ChainedValidator(DefaultValidator(), ResolutionCollisionValidator())) + self.container.register_property(None, r'((?P' + digital_numeral + ')' + '(?P(?:x' + digital_numeral + '(?:[x-]' + digital_numeral + ')*)))', confidence=1.0, formatter={None: parse_numeral, 'episodeNumber': episode_parser_x, 'season': season_parser}, 
validator=ChainedValidator(DefaultValidator(), ResolutionCollisionValidator())) + self.container.register_property(None, r'(' + season_markers_re.pattern + '(?P' + digital_numeral + '(?:' + sep + '?' + all_separators_re.pattern + sep + '?' + digital_numeral + ')*))', confidence=0.6, formatter={None: parse_numeral, 'season': season_parser}, validator=NoValidator()) + + self.container.register_property(None, r'((?P' + digital_numeral + ')' + sep + '?v(?P\d+))', confidence=0.6, formatter=parse_numeral) + self.container.register_property(None, r'(ep' + sep + r'?(?P' + digital_numeral + ')' + sep + '?)', confidence=0.7, formatter=parse_numeral) + self.container.register_property(None, r'(ep' + sep + r'?(?P' + digital_numeral + ')' + sep + '?v(?P\d+))', confidence=0.7, formatter=parse_numeral) + + + self.container.register_property(None, r'(' + episode_markers_re.pattern + '(?P' + digital_numeral + '(?:' + sep + '?' + all_separators_re.pattern + sep + '?' + digital_numeral + ')*))', confidence=0.6, formatter={None: parse_numeral, 'episodeNumber': episode_parser}) + self.container.register_property(None, r'(' + episode_words_re.pattern + sep + '?(?P' + digital_numeral + '(?:' + sep + '?' + all_separators_re.pattern + sep + '?' + digital_numeral + ')*)' + sep + '?' 
+ episode_words_re.pattern + '?)', confidence=0.8, formatter={None: parse_numeral, 'episodeNumber': episode_parser}) + + self.container.register_property(None, r'(' + episode_markers_re.pattern + '(?P' + digital_numeral + ')' + sep + '?v(?P\d+))', confidence=0.6, formatter={None: parse_numeral, 'episodeNumber': episode_parser}) + self.container.register_property(None, r'(' + episode_words_re.pattern + sep + '?(?P' + digital_numeral + ')' + sep + '?v(?P\d+))', confidence=0.8, formatter={None: parse_numeral, 'episodeNumber': episode_parser}) + + + self.container.register_property('episodeNumber', r'^ ?(\d{2})' + sep, confidence=0.4, formatter=parse_numeral) + self.container.register_property('episodeNumber', r'^ ?(\d{2})' + sep, confidence=0.4, formatter=parse_numeral) + self.container.register_property('episodeNumber', r'^ ?0(\d{1,2})' + sep, confidence=0.4, formatter=parse_numeral) + self.container.register_property('episodeNumber', sep + r'(\d{2}) ?$', confidence=0.4, formatter=parse_numeral) + self.container.register_property('episodeNumber', sep + r'0(\d{1,2}) ?$', confidence=0.4, formatter=parse_numeral) + + self.container.register_property(None, r'((?P' + numeral + ')' + sep + '?' + of_separators_re.pattern + sep + '?(?P' + numeral + ')(?:' + sep + '?(?:episodes?|eps?))?)', confidence=0.7, formatter=parse_numeral) + self.container.register_property(None, r'((?:episodes?|eps?)' + sep + '?(?P' + numeral + ')' + sep + '?' + of_separators_re.pattern + sep + '?(?P' + numeral + '))', confidence=0.7, formatter=parse_numeral) + self.container.register_property(None, r'((?:seasons?|saisons?|s)' + sep + '?(?P' + numeral + ')' + sep + '?' + of_separators_re.pattern + sep + '?(?P' + numeral + '))', confidence=0.7, formatter=parse_numeral) + self.container.register_property(None, r'((?P' + numeral + ')' + sep + '?' 
+ of_separators_re.pattern + sep + '?(?P' + numeral + ')' + sep + '?(?:seasons?|saisons?|s))', confidence=0.7, formatter=parse_numeral) + + self.container.register_canonical_properties('other', 'FiNAL', 'Complete', validator=WeakValidator()) + + self.container.register_property(None, r'[^0-9]((?P' + digital_numeral + ')[^0-9 .-]?-?(?PxAll))', confidence=1.0, formatter={None: parse_numeral, 'other': lambda x: 'Complete', 'season': season_parser}, validator=ChainedValidator(DefaultValidator(), ResolutionCollisionValidator())) + + def register_arguments(self, opts, naming_opts, output_opts, information_opts, webservice_opts, other_options): + naming_opts.add_argument('-E', '--episode-prefer-number', action='store_true', dest='episode_prefer_number', default=False, + help='Guess "serie.213.avi" as the episodeNumber 213. Without this option, ' + 'it will be guessed as season 2, episodeNumber 13') + + def supported_properties(self): + return ['episodeNumber', 'season', 'episodeList', 'seasonList', 'episodeCount', 'seasonCount', 'version', 'other'] + + def guess_episodes_rexps(self, string, node=None, options=None): + found = self.container.find_properties(string, node, options) + return self.container.as_guess(found, string) + + def should_process(self, mtree, options=None): + return mtree.guess.get('type', '').startswith('episode') + + def process(self, mtree, options=None): + GuessFinder(self.guess_episodes_rexps, None, self.log, options).process_nodes(mtree.unidentified_leaves()) diff --git a/libs/guessit/transfo/guess_filetype.py b/libs/guessit/transfo/guess_filetype.py index 4279c0b0..0eb3475f 100644 --- a/libs/guessit/transfo/guess_filetype.py +++ b/libs/guessit/transfo/guess_filetype.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU 
General Public License as published by @@ -18,182 +18,220 @@ # along with this program. If not, see . # -from __future__ import unicode_literals -from guessit import Guess -from guessit.patterns import (subtitle_exts, info_exts, video_exts, episode_rexps, - find_properties, compute_canonical_form) -from guessit.date import valid_year -from guessit.textutils import clean_string +from __future__ import absolute_import, division, print_function, unicode_literals + +import mimetypes import os.path import re -import mimetypes -import logging -log = logging.getLogger(__name__) +from guessit.guess import Guess +from guessit.patterns.extension import subtitle_exts, info_exts, video_exts +from guessit.transfo import TransformerException +from guessit.plugins.transformers import Transformer, get_transformer +from guessit.matcher import log_found_guess, found_guess, found_property -# List of well known movies and series, hardcoded because they cannot be -# guessed appropriately otherwise -MOVIES = [ 'OSS 117' ] -SERIES = [ 'Band of Brothers' ] -MOVIES = [ m.lower() for m in MOVIES ] -SERIES = [ s.lower() for s in SERIES ] +class GuessFiletype(Transformer): + def __init__(self): + Transformer.__init__(self, 200) -def guess_filetype(mtree, filetype): - # put the filetype inside a dummy container to be able to have the - # following functions work correctly as closures - # this is a workaround for python 2 which doesn't have the - # 'nonlocal' keyword (python 3 does have it) - filetype_container = [filetype] - other = {} - filename = mtree.string + # List of well known movies and series, hardcoded because they cannot be + # guessed appropriately otherwise + MOVIES = ['OSS 117'] + SERIES = ['Band of Brothers'] - def upgrade_episode(): - if filetype_container[0] == 'video': - filetype_container[0] = 'episode' - elif filetype_container[0] == 'subtitle': - filetype_container[0] = 'episodesubtitle' - elif filetype_container[0] == 'info': - filetype_container[0] = 'episodeinfo' + 
MOVIES = [m.lower() for m in MOVIES] + SERIES = [s.lower() for s in SERIES] - def upgrade_movie(): - if filetype_container[0] == 'video': - filetype_container[0] = 'movie' - elif filetype_container[0] == 'subtitle': - filetype_container[0] = 'moviesubtitle' - elif filetype_container[0] == 'info': - filetype_container[0] = 'movieinfo' + def guess_filetype(self, mtree, options=None): + options = options or {} - def upgrade_subtitle(): - if 'movie' in filetype_container[0]: - filetype_container[0] = 'moviesubtitle' - elif 'episode' in filetype_container[0]: - filetype_container[0] = 'episodesubtitle' + # put the filetype inside a dummy container to be able to have the + # following functions work correctly as closures + # this is a workaround for python 2 which doesn't have the + # 'nonlocal' keyword which we could use here in the upgrade_* functions + # (python 3 does have it) + filetype_container = [mtree.guess.get('type')] + other = {} + filename = mtree.string + + def upgrade_episode(): + if filetype_container[0] == 'subtitle': + filetype_container[0] = 'episodesubtitle' + elif filetype_container[0] == 'info': + filetype_container[0] = 'episodeinfo' + elif (not filetype_container[0] or + filetype_container[0] == 'video'): + filetype_container[0] = 'episode' + + def upgrade_movie(): + if filetype_container[0] == 'subtitle': + filetype_container[0] = 'moviesubtitle' + elif filetype_container[0] == 'info': + filetype_container[0] = 'movieinfo' + elif (not filetype_container[0] or + filetype_container[0] == 'video'): + filetype_container[0] = 'movie' + + def upgrade_subtitle(): + if filetype_container[0] == 'movie': + filetype_container[0] = 'moviesubtitle' + elif filetype_container[0] == 'episode': + filetype_container[0] = 'episodesubtitle' + elif not filetype_container[0]: + filetype_container[0] = 'subtitle' + + def upgrade_info(): + if filetype_container[0] == 'movie': + filetype_container[0] = 'movieinfo' + elif filetype_container[0] == 'episode': + 
filetype_container[0] = 'episodeinfo' + elif not filetype_container[0]: + filetype_container[0] = 'info' + + # look at the extension first + fileext = os.path.splitext(filename)[1][1:].lower() + if fileext in subtitle_exts: + upgrade_subtitle() + other = {'container': fileext} + elif fileext in info_exts: + upgrade_info() + other = {'container': fileext} + elif fileext in video_exts: + other = {'container': fileext} else: - filetype_container[0] = 'subtitle' + if fileext and not options.get('name_only'): + other = {'extension': fileext} + list(mtree.unidentified_leaves())[-1].guess = Guess(other) - def upgrade_info(): - if 'movie' in filetype_container[0]: - filetype_container[0] = 'movieinfo' - elif 'episode' in filetype_container[0]: - filetype_container[0] = 'episodeinfo' - else: - filetype_container[0] = 'info' + # check whether we are in a 'Movies', 'Tv Shows', ... folder + folder_rexps = [(r'Movies?', upgrade_movie), + (r'Films?', upgrade_movie), + (r'Tv[ _-]?Shows?', upgrade_episode), + (r'Series?', upgrade_episode), + (r'Episodes?', upgrade_episode)] + for frexp, upgrade_func in folder_rexps: + frexp = re.compile(frexp, re.IGNORECASE) + for pathgroup in mtree.children: + if frexp.match(pathgroup.value): + upgrade_func() + return filetype_container[0], other - def upgrade(type='unknown'): - if filetype_container[0] == 'autodetect': - filetype_container[0] = type + # check for a few specific cases which will unintentionally make the + # following heuristics confused (eg: OSS 117 will look like an episode, + # season 1, epnum 17, when it is in fact a movie) + fname = mtree.clean_string(filename).lower() + for m in self.MOVIES: + if m in fname: + self.log.debug('Found in exception list of movies -> type = movie') + upgrade_movie() + return filetype_container[0], other + for s in self.SERIES: + if s in fname: + self.log.debug('Found in exception list of series -> type = episode') + upgrade_episode() + return filetype_container[0], other - - # look at the 
extension first - fileext = os.path.splitext(filename)[1][1:].lower() - if fileext in subtitle_exts: - upgrade_subtitle() - other = { 'container': fileext } - elif fileext in info_exts: - upgrade_info() - other = { 'container': fileext } - elif fileext in video_exts: - upgrade(type='video') - other = { 'container': fileext } - else: - upgrade(type='unknown') - other = { 'extension': fileext } - - - - # check whether we are in a 'Movies', 'Tv Shows', ... folder - folder_rexps = [ (r'Movies?', upgrade_movie), - (r'Tv[ _-]?Shows?', upgrade_episode), - (r'Series', upgrade_episode) - ] - for frexp, upgrade_func in folder_rexps: - frexp = re.compile(frexp, re.IGNORECASE) - for pathgroup in mtree.children: - if frexp.match(pathgroup.value): - upgrade_func() - - # check for a few specific cases which will unintentionally make the - # following heuristics confused (eg: OSS 117 will look like an episode, - # season 1, epnum 17, when it is in fact a movie) - fname = clean_string(filename).lower() - for m in MOVIES: - if m in fname: - log.debug('Found in exception list of movies -> type = movie') - upgrade_movie() - for s in SERIES: - if s in fname: - log.debug('Found in exception list of series -> type = episode') - upgrade_episode() - - # now look whether there are some specific hints for episode vs movie - if filetype_container[0] in ('video', 'subtitle', 'info'): # if we have an episode_rexp (eg: s02e13), it is an episode - for rexp, _, _ in episode_rexps: - match = re.search(rexp, filename, re.IGNORECASE) - if match: - log.debug('Found matching regexp: "%s" (string = "%s") -> type = episode', rexp, match.group()) + episode_transformer = get_transformer('guess_episodes_rexps') + if episode_transformer: + filename_parts = list(x.value for x in mtree.unidentified_leaves()); + filename_parts.append(filename) + for filename_part in filename_parts: + guess = episode_transformer.guess_episodes_rexps(filename_part) + if guess: + self.log.debug('Found guess_episodes_rexps: %s -> 
type = episode', guess) + upgrade_episode() + return filetype_container[0], other + + properties_transformer = get_transformer('guess_properties') + if properties_transformer: + # if we have certain properties characteristic of episodes, it is an ep + found = properties_transformer.container.find_properties(filename, mtree, options, 'episodeFormat') + guess = properties_transformer.container.as_guess(found, filename) + if guess: + self.log.debug('Found characteristic property of episodes: %s"', guess) upgrade_episode() - break + return filetype_container[0], other - # if we have a 3-4 digit number that's not a year, maybe an episode - match = re.search(r'[^0-9]([0-9]{3,4})[^0-9]', filename) - if match: - fullnumber = int(match.group()[1:-1]) - #season = fullnumber // 100 - epnumber = fullnumber % 100 - possible = True + weak_episode_transformer = get_transformer('guess_weak_episodes_rexps') + if weak_episode_transformer: + found = properties_transformer.container.find_properties(filename, mtree, options, 'crc32') + guess = properties_transformer.container.as_guess(found, filename) + if guess: + found = weak_episode_transformer.container.find_properties(filename, mtree, options) + guess = weak_episode_transformer.container.as_guess(found, filename) + if guess: + self.log.debug('Found characteristic property of episodes: %s"', guess) + upgrade_episode() + return filetype_container[0], other - # check for validity - if epnumber > 40: - possible = False - if valid_year(fullnumber): - possible = False + found = properties_transformer.container.find_properties(filename, mtree, options, 'format') + guess = properties_transformer.container.as_guess(found, filename) + if guess and guess['format'] in ('HDTV', 'WEBRip', 'WEB-DL', 'DVB'): + # Use weak episodes only if TV or WEB source + weak_episode_transformer = get_transformer('guess_weak_episodes_rexps') + if weak_episode_transformer: + guess = weak_episode_transformer.guess_weak_episodes_rexps(filename) + if guess: + 
self.log.debug('Found guess_weak_episodes_rexps: %s -> type = episode', guess) + upgrade_episode() + return filetype_container[0], other - if possible: - log.debug('Found possible episode number: %s (from string "%s") -> type = episode', epnumber, match.group()) - upgrade_episode() + website_transformer = get_transformer('guess_website') + if website_transformer: + found = website_transformer.container.find_properties(filename, mtree, options, 'website') + guess = website_transformer.container.as_guess(found, filename) + if guess: + for namepart in ('tv', 'serie', 'episode'): + if namepart in guess['website']: + # origin-specific type + self.log.debug('Found characteristic property of episodes: %s', guess) + upgrade_episode() + return filetype_container[0], other - # if we have certain properties characteristic of episodes, it is an ep - for prop, value, _, _ in find_properties(filename): - log.debug('prop: %s = %s' % (prop, value)) - if prop == 'episodeFormat': - log.debug('Found characteristic property of episodes: %s = "%s"', prop, value) - upgrade_episode() - break + if filetype_container[0] in ('subtitle', 'info') or (not filetype_container[0] and fileext in video_exts): + # if no episode info found, assume it's a movie + self.log.debug('Nothing characteristic found, assuming type = movie') + upgrade_movie() - elif compute_canonical_form('format', value) == 'DVB': - log.debug('Found characteristic property of episodes: %s = "%s"', prop, value) - upgrade_episode() - break + if not filetype_container[0]: + self.log.debug('Nothing characteristic found, assuming type = unknown') + filetype_container[0] = 'unknown' - # origin-specific type - if 'tvu.org.ru' in filename: - log.debug('Found characteristic property of episodes: %s = "%s"', prop, value) - upgrade_episode() + return filetype_container[0], other - # if no episode info found, assume it's a movie - log.debug('Nothing characteristic found, assuming type = movie') - upgrade_movie() + def process(self, mtree, 
options=None): + """guess the file type now (will be useful later) + """ + filetype, other = self.guess_filetype(mtree, options) - filetype = filetype_container[0] - return filetype, other + mtree.guess.set('type', filetype, confidence=1.0) + log_found_guess(mtree.guess) + filetype_info = Guess(other, confidence=1.0) + # guess the mimetype of the filename + # TODO: handle other mimetypes not found on the default type_maps + # mimetypes.types_map['.srt']='text/subtitle' + mime, _ = mimetypes.guess_type(mtree.string, strict=False) + if mime is not None: + filetype_info.update({'mimetype': mime}, confidence=1.0) -def process(mtree, filetype='autodetect'): - filetype, other = guess_filetype(mtree, filetype) + node_ext = mtree.node_at((-1,)) + found_guess(node_ext, filetype_info) - mtree.guess.set('type', filetype, confidence=1.0) - log.debug('Found with confidence %.2f: %s' % (1.0, mtree.guess)) + if mtree.guess.get('type') in [None, 'unknown']: + if options.get('name_only'): + mtree.guess.set('type', 'movie', confidence=0.6) + else: + raise TransformerException(__name__, 'Unknown file type') - filetype_info = Guess(other, confidence=1.0) - # guess the mimetype of the filename - # TODO: handle other mimetypes not found on the default type_maps - # mimetypes.types_map['.srt']='text/subtitle' - mime, _ = mimetypes.guess_type(mtree.string, strict=False) - if mime is not None: - filetype_info.update({'mimetype': mime}, confidence=1.0) - - node_ext = mtree.node_at((-1,)) - node_ext.guess = filetype_info - log.debug('Found with confidence %.2f: %s' % (1.0, node_ext.guess)) + def post_process(self, mtree, options=None): + # now look whether there are some specific hints for episode vs movie + # If we have a date and no year, this is a TV Show. 
+ if 'date' in mtree.info and 'year' not in mtree.info and mtree.info.get('type') != 'episode': + mtree.guess['type'] = 'episode' + for type_leaves in mtree.leaves_containing('type'): + type_leaves.guess['type'] = 'episode' + for title_leaves in mtree.leaves_containing('title'): + title_leaves.guess.rename('title', 'series') \ No newline at end of file diff --git a/libs/guessit/transfo/guess_idnumber.py b/libs/guessit/transfo/guess_idnumber.py index 0e15af5c..30b63cbd 100644 --- a/libs/guessit/transfo/guess_idnumber.py +++ b/libs/guessit/transfo/guess_idnumber.py @@ -18,54 +18,62 @@ # along with this program. If not, see . # -from __future__ import unicode_literals -from guessit.transfo import SingleNodeGuesser -from guessit.patterns import find_properties +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.plugins.transformers import Transformer +from guessit.matcher import GuessFinder import re -import logging -log = logging.getLogger(__name__) +_DIGIT = 0 +_LETTER = 1 +_OTHER = 2 -def guess_properties(string): - try: - prop, value, pos, end = find_properties(string)[0] - return { prop: value }, (pos, end) - except IndexError: +class GuessIdnumber(Transformer): + def __init__(self): + Transformer.__init__(self, 220) + + def supported_properties(self): + return ['idNumber'] + + _idnum = re.compile(r'(?P[a-zA-Z0-9-]{20,})') # 1.0, (0, 0)) + + def guess_idnumber(self, string, node=None, options=None): + match = self._idnum.search(string) + if match is not None: + result = match.groupdict() + switch_count = 0 + switch_letter_count = 0; + letter_count = 0; + last_letter = None + + last = _LETTER + for c in result['idNumber']: + if c in '0123456789': + ci = _DIGIT + elif c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ': + ci = _LETTER + if c != last_letter: + switch_letter_count += 1 + last_letter = c + letter_count += 1 + else: + ci = _OTHER + + if ci != last: + switch_count += 1 + + last = ci + + switch_ratio 
= float(switch_count) / len(result['idNumber']) + letters_ratio = (float(switch_letter_count) / letter_count) if letter_count > 0 else 1 + + # only return the result as probable if we alternate often between + # char type (more likely for hash values than for common words) + if switch_ratio > 0.4 and letters_ratio > 0.4: + return result, match.span() + return None, None -_idnum = re.compile(r'(?P[a-zA-Z0-9-]{10,})') # 1.0, (0, 0)) - -def guess_idnumber(string): - match = _idnum.search(string) - if match is not None: - result = match.groupdict() - switch_count = 0 - DIGIT = 0 - LETTER = 1 - OTHER = 2 - last = LETTER - for c in result['idNumber']: - if c in '0123456789': - ci = DIGIT - elif c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ': - ci = LETTER - else: - ci = OTHER - - if ci != last: - switch_count += 1 - - last = ci - - switch_ratio = float(switch_count) / len(result['idNumber']) - - # only return the result as probable if we alternate often between - # char type (more likely for hash values than for common words) - if switch_ratio > 0.4: - return result, match.span() - - return None, None - -def process(mtree): - SingleNodeGuesser(guess_idnumber, 0.4, log).process(mtree) + def process(self, mtree, options=None): + GuessFinder(self.guess_idnumber, 0.4, self.log, options).process_nodes(mtree.unidentified_leaves()) diff --git a/libs/guessit/transfo/guess_language.py b/libs/guessit/transfo/guess_language.py index 648a06b1..cb9787d3 100644 --- a/libs/guessit/transfo/guess_language.py +++ b/libs/guessit/transfo/guess_language.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,38 +18,169 @@ # along with this program. If not, see . 
# -from __future__ import unicode_literals -from guessit import Guess -from guessit.transfo import SingleNodeGuesser -from guessit.language import search_language -import logging +from __future__ import absolute_import, division, print_function, unicode_literals -log = logging.getLogger(__name__) +from guessit.language import search_language, subtitle_prefixes, subtitle_suffixes +from guessit.patterns.extension import subtitle_exts +from guessit.textutils import find_words +from guessit.plugins.transformers import Transformer +from guessit.matcher import GuessFinder -def guess_language(string, node, skip=None): - if skip: - relative_skip = [] - for entry in skip: - node_idx = entry['node_idx'] - span = entry['span'] - if node_idx == node.node_idx[:len(node_idx)]: - relative_span = (span[0] - node.offset + 1, span[1] - node.offset + 1) - relative_skip.append(relative_span) - skip = relative_skip +class GuessLanguage(Transformer): + def __init__(self): + Transformer.__init__(self, 30) - language, span, confidence = search_language(string, skip=skip) - if language: - return (Guess({'language': language}, - confidence=confidence, - raw= string[span[0]:span[1]]), - span) + def register_arguments(self, opts, naming_opts, output_opts, information_opts, webservice_opts, other_options): + naming_opts.add_argument('-L', '--allowed-languages', action='append', dest='allowed_languages', + help='Allowed language (can be used multiple times)') - return None, None + def supported_properties(self): + return ['language', 'subtitleLanguage'] -guess_language.use_node = True + def guess_language(self, string, node=None, options=None): + allowed_languages = None + if options and 'allowed_languages' in options: + allowed_languages = options.get('allowed_languages') + guess = search_language(string, allowed_languages) + return guess + def _skip_language_on_second_pass(self, mtree, node): + """Check if found node is a valid language node, or if it's a false positive. 
-def process(mtree, *args, **kwargs): - SingleNodeGuesser(guess_language, None, log, *args, **kwargs).process(mtree) - # Note: 'language' is promoted to 'subtitleLanguage' in the post_process transfo + :param mtree: Tree detected on first pass. + :type mtree: :class:`guessit.matchtree.MatchTree` + :param node: Node that contains a language Guess + :type node: :class:`guessit.matchtree.MatchTree` + + :return: True if a second pass skipping this node is required + :rtype: bool + """ + unidentified_starts = {} + unidentified_ends = {} + + property_starts = {} + property_ends = {} + + title_starts = {} + title_ends = {} + + for unidentified_node in mtree.unidentified_leaves(): + unidentified_starts[unidentified_node.span[0]] = unidentified_node + unidentified_ends[unidentified_node.span[1]] = unidentified_node + + for property_node in mtree.leaves_containing('year'): + property_starts[property_node.span[0]] = property_node + property_ends[property_node.span[1]] = property_node + + for title_node in mtree.leaves_containing(['title', 'series']): + title_starts[title_node.span[0]] = title_node + title_ends[title_node.span[1]] = title_node + + return node.span[0] in title_ends.keys() and (node.span[1] in unidentified_starts.keys() or node.span[1] + 1 in property_starts.keys()) or\ + node.span[1] in title_starts.keys() and (node.span[0] == node.group_node().span[0] or node.span[0] in unidentified_ends.keys() or node.span[0] in property_ends.keys()) + + def second_pass_options(self, mtree, options=None): + m = mtree.matched() + to_skip_language_nodes = [] + + for lang_key in ('language', 'subtitleLanguage'): + langs = {} + lang_nodes = set(mtree.leaves_containing(lang_key)) + + for lang_node in lang_nodes: + lang = lang_node.guess.get(lang_key, None) + if self._skip_language_on_second_pass(mtree, lang_node): + # Language probably split the title. Add to skip for 2nd pass. 
+ + # if filetype is subtitle and the language appears last, just before + # the extension, then it is likely a subtitle language + parts = mtree.clean_string(lang_node.root.value).split() + if m.get('type') in ['moviesubtitle', 'episodesubtitle']: + if lang_node.value in parts and \ + (parts.index(lang_node.value) == len(parts) - 2): + continue + to_skip_language_nodes.append(lang_node) + elif lang not in langs: + langs[lang] = lang_node + else: + # The same language was found. Keep the more confident one, + # and add others to skip for 2nd pass. + existing_lang_node = langs[lang] + to_skip = None + if (existing_lang_node.guess.confidence('language') >= + lang_node.guess.confidence('language')): + # lang_node is to remove + to_skip = lang_node + else: + # existing_lang_node is to remove + langs[lang] = lang_node + to_skip = existing_lang_node + to_skip_language_nodes.append(to_skip) + + if to_skip_language_nodes: + # Also skip same value nodes + skipped_values = [skip_node.value for skip_node in to_skip_language_nodes] + + for lang_key in ('language', 'subtitleLanguage'): + lang_nodes = set(mtree.leaves_containing(lang_key)) + + for lang_node in lang_nodes: + if lang_node not in to_skip_language_nodes and lang_node.value in skipped_values: + to_skip_language_nodes.append(lang_node) + return {'skip_nodes': to_skip_language_nodes} + return None + + def should_process(self, mtree, options=None): + options = options or {} + return options.get('language', True) + + def process(self, mtree, options=None): + GuessFinder(self.guess_language, None, self.log, options).process_nodes(mtree.unidentified_leaves()) + + def promote_subtitle(self, node): + if 'language' in node.guess: + node.guess.set('subtitleLanguage', node.guess['language'], + confidence=node.guess.confidence('language')) + del node.guess['language'] + + def post_process(self, mtree, options=None): + # 1- try to promote language to subtitle language where it makes sense + for node in mtree.nodes(): + if 
'language' not in node.guess: + continue + + # - if we matched a language in a file with a sub extension and that + # the group is the last group of the filename, it is probably the + # language of the subtitle + # (eg: 'xxx.english.srt') + if (mtree.node_at((-1,)).value.lower() in subtitle_exts and + node == list(mtree.leaves())[-2]): + self.promote_subtitle(node) + + # - if we find in the same explicit group + # a subtitle prefix before the language, + # or a subtitle suffix after the language, + # then upgrade the language + explicit_group = mtree.node_at(node.node_idx[:2]) + group_str = explicit_group.value.lower() + + for sub_prefix in subtitle_prefixes: + if (sub_prefix in find_words(group_str) and + 0 <= group_str.find(sub_prefix) < (node.span[0] - explicit_group.span[0])): + self.promote_subtitle(node) + + for sub_suffix in subtitle_suffixes: + if (sub_suffix in find_words(group_str) and + (node.span[0] - explicit_group.span[0]) < group_str.find(sub_suffix)): + self.promote_subtitle(node) + + # - if a language is in an explicit group just preceded by "st", + # it is a subtitle language (eg: '...st[fr-eng]...') + try: + idx = node.node_idx + previous = list(mtree.node_at((idx[0], idx[1] - 1)).leaves())[-1] + if previous.value.lower()[-2:] == 'st': + self.promote_subtitle(node) + except IndexError: + pass diff --git a/libs/guessit/transfo/guess_movie_title_from_position.py b/libs/guessit/transfo/guess_movie_title_from_position.py index bcb42b45..671e4cb5 100644 --- a/libs/guessit/transfo/guess_movie_title_from_position.py +++ b/libs/guessit/transfo/guess_movie_title_from_position.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,157 +18,156 @@ # along with this program. If not, see . 
# -from __future__ import unicode_literals -from guessit import Guess -import unicodedata -import logging +from __future__ import absolute_import, division, print_function, unicode_literals -log = logging.getLogger(__name__) +from guessit.plugins.transformers import Transformer +from guessit.matcher import found_property +from guessit import u -def process(mtree): - def found_property(node, name, value, confidence): - node.guess = Guess({ name: value }, - confidence=confidence, - raw=value) - log.debug('Found with confidence %.2f: %s' % (confidence, node.guess)) +class GuessMovieTitleFromPosition(Transformer): + def __init__(self): + Transformer.__init__(self, -200) - def found_title(node, confidence): - found_property(node, 'title', node.clean_value, confidence) + def supported_properties(self): + return ['title'] - basename = mtree.node_at((-2,)) - all_valid = lambda leaf: len(leaf.clean_value) > 0 - basename_leftover = basename.unidentified_leaves(valid=all_valid) + def should_process(self, mtree, options=None): + options = options or {} + return not options.get('skip_title') and not mtree.guess.get('type', '').startswith('episode') - try: - folder = mtree.node_at((-3,)) - folder_leftover = folder.unidentified_leaves() - except ValueError: - folder = None - folder_leftover = [] - - log.debug('folder: %s' % folder_leftover) - log.debug('basename: %s' % basename_leftover) - - # specific cases: - # if we find the same group both in the folder name and the filename, - # it's a good candidate for title - if (folder_leftover and basename_leftover and - folder_leftover[0].clean_value == basename_leftover[0].clean_value): - - found_title(folder_leftover[0], confidence=0.8) - return - - # specific cases: - # if the basename contains a number first followed by an unidentified - # group, and the folder only contains 1 unidentified one, then we have - # a series - # ex: Millenium Trilogy (2009)/(1)The Girl With The Dragon Tattoo(2009).mkv - try: - series = 
folder_leftover[0] - filmNumber = basename_leftover[0] - title = basename_leftover[1] - - basename_leaves = basename.leaves() - - num = int(filmNumber.clean_value) - - log.debug('series: %s' % series.clean_value) - log.debug('title: %s' % title.clean_value) - if (series.clean_value != title.clean_value and - series.clean_value != filmNumber.clean_value and - basename_leaves.index(filmNumber) == 0 and - basename_leaves.index(title) == 1): - - found_title(title, confidence=0.6) - found_property(series, 'filmSeries', - series.clean_value, confidence=0.6) - found_property(filmNumber, 'filmNumber', - num, confidence=0.6) - return - except Exception: - pass - - # specific cases: - # - movies/tttttt (yyyy)/tttttt.ccc - try: - if mtree.node_at((-4, 0)).value.lower() == 'movies': - folder = mtree.node_at((-3,)) - - # Note:too generic, might solve all the unittests as they all - # contain 'movies' in their path - # - #if containing_folder.is_leaf() and not containing_folder.guess: - # containing_folder.guess = - # Guess({ 'title': clean_string(containing_folder.value) }, - # confidence=0.7) - - year_group = folder.first_leaf_containing('year') - groups_before = folder.previous_unidentified_leaves(year_group) - - found_title(groups_before[0], confidence=0.8) + def process(self, mtree, options=None): + """ + try to identify the remaining unknown groups by looking at their + position relative to other known elements + """ + if 'title' in mtree.info: return - except Exception: - pass + basename = mtree.node_at((-2,)) + all_valid = lambda leaf: len(leaf.clean_value) > 0 + basename_leftover = list(basename.unidentified_leaves(valid=all_valid)) - # if we have either format or videoCodec in the folder containing the file - # or one of its parents, then we should probably look for the title in - # there rather than in the basename - try: - props = mtree.previous_leaves_containing(mtree.children[-2], - [ 'videoCodec', 'format', - 'language' ]) - except IndexError: - props = [] + try: 
+ folder = mtree.node_at((-3,)) + folder_leftover = list(folder.unidentified_leaves()) + except ValueError: + folder = None + folder_leftover = [] - if props: - group_idx = props[0].node_idx[0] - if all(g.node_idx[0] == group_idx for g in props): - # if they're all in the same group, take leftover info from there - leftover = mtree.node_at((group_idx,)).unidentified_leaves() + self.log.debug('folder: %s' % u(folder_leftover)) + self.log.debug('basename: %s' % u(basename_leftover)) - if leftover: - found_title(leftover[0], confidence=0.7) + # specific cases: + # if we find the same group both in the folder name and the filename, + # it's a good candidate for title + if folder_leftover and basename_leftover and folder_leftover[0].clean_value == basename_leftover[0].clean_value: + found_property(folder_leftover[0], 'title', confidence=0.8) + return + + # specific cases: + # if the basename contains a number first followed by an unidentified + # group, and the folder only contains 1 unidentified one, then we have + # a series + # ex: Millenium Trilogy (2009)/(1)The Girl With The Dragon Tattoo(2009).mkv + if len(folder_leftover) > 0 and len(basename_leftover) > 1: + series = folder_leftover[0] + film_number = basename_leftover[0] + title = basename_leftover[1] + + basename_leaves = list(basename.leaves()) + + num = None + try: + num = int(film_number.clean_value) + except ValueError: + pass + + if num: + self.log.debug('series: %s' % series.clean_value) + self.log.debug('title: %s' % title.clean_value) + if (series.clean_value != title.clean_value and + series.clean_value != film_number.clean_value and + basename_leaves.index(film_number) == 0 and + basename_leaves.index(title) == 1): + + found_property(title, 'title', confidence=0.6) + found_property(series, 'filmSeries', confidence=0.6) + found_property(film_number, 'filmNumber', num, confidence=0.6) return - # look for title in basename if there are some remaining undidentified - # groups there - if 
basename_leftover: - title_candidate = basename_leftover[0] + if folder: + year_group = folder.first_leaf_containing('year') + if year_group: + groups_before = folder.previous_unidentified_leaves(year_group) + if groups_before: + try: + node = next(groups_before) + found_property(node, 'title', confidence=0.8) + return + except StopIteration: + pass - # if basename is only one word and the containing folder has at least - # 3 words in it, we should take the title from the folder name - # ex: Movies/Alice in Wonderland DVDRip.XviD-DiAMOND/dmd-aw.avi - # ex: Movies/Somewhere.2010.DVDRip.XviD-iLG/i-smwhr.avi <-- TODO: gets caught here? - if (title_candidate.clean_value.count(' ') == 0 and - folder_leftover and - folder_leftover[0].clean_value.count(' ') >= 2): + # if we have either format or videoCodec in the folder containing the + # file or one of its parents, then we should probably look for the title + # in there rather than in the basename + try: + props = list(mtree.previous_leaves_containing(mtree.children[-2], + ['videoCodec', + 'format', + 'language'])) + except IndexError: + props = [] - found_title(folder_leftover[0], confidence=0.7) + if props: + group_idx = props[0].node_idx[0] + if all(g.node_idx[0] == group_idx for g in props): + # if they're all in the same group, take leftover info from there + leftover = mtree.node_at((group_idx,)).unidentified_leaves() + try: + found_property(next(leftover), 'title', confidence=0.7) + return + except StopIteration: + pass + + # look for title in basename if there are some remaining unidentified + # groups there + if basename_leftover: + # if basename is only one word and the containing folder has at least + # 3 words in it, we should take the title from the folder name + # ex: Movies/Alice in Wonderland DVDRip.XviD-DiAMOND/dmd-aw.avi + # ex: Movies/Somewhere.2010.DVDRip.XviD-iLG/i-smwhr.avi <-- TODO: gets caught here? 
+ if (basename_leftover[0].clean_value.count(' ') == 0 and + folder_leftover and folder_leftover[0].clean_value.count(' ') >= 2): + + found_property(folder_leftover[0], 'title', confidence=0.7) + return + + # if there are only many unidentified groups, take the first of which is + # not inside brackets or parentheses. + # ex: Movies/[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi + if basename_leftover[0].is_explicit(): + for basename_leftover_elt in basename_leftover: + if not basename_leftover_elt.is_explicit(): + found_property(basename_leftover_elt, 'title', confidence=0.8) + return + + # if all else fails, take the first remaining unidentified group in the + # basename as title + found_property(basename_leftover[0], 'title', confidence=0.6) return - # if there are only 2 unidentified groups, the first of which is inside - # brackets or parentheses, we take the second one for the title: - # ex: Movies/[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi - if len(basename_leftover) == 2 and basename_leftover[0].is_explicit(): - found_title(basename_leftover[1], confidence=0.8) + # if there are no leftover groups in the basename, look in the folder name + if folder_leftover: + found_property(folder_leftover[0], 'title', confidence=0.5) return - # if all else fails, take the first remaining unidentified group in the - # basename as title - found_title(title_candidate, confidence=0.6) - return - - # if there are no leftover groups in the basename, look in the folder name - if folder_leftover: - found_title(folder_leftover[0], confidence=0.5) - return - - # if nothing worked, look if we have a very small group at the beginning - # of the basename - basename = mtree.node_at((-2,)) - basename_leftover = basename.unidentified_leaves(valid=lambda leaf: True) - if basename_leftover: - found_title(basename_leftover[0], confidence=0.4) - return + # if nothing worked, look if we have a very small group at the beginning + # of the basename + basename = mtree.node_at((-2,)) + 
basename_leftover = basename.unidentified_leaves(valid=lambda leaf: True) + try: + found_property(next(basename_leftover), 'title', confidence=0.4) + return + except StopIteration: + pass diff --git a/libs/guessit/transfo/guess_properties.py b/libs/guessit/transfo/guess_properties.py index 6c72dfd5..01aecddc 100644 --- a/libs/guessit/transfo/guess_properties.py +++ b/libs/guessit/transfo/guess_properties.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack +# Copyright (c) 2013 Rémi Alvergnat # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,21 +18,271 @@ # along with this program. If not, see . # -from __future__ import unicode_literals -from guessit.transfo import SingleNodeGuesser -from guessit.patterns import find_properties -import logging +from __future__ import absolute_import, division, print_function, unicode_literals -log = logging.getLogger(__name__) +from guessit.containers import PropertiesContainer, WeakValidator, LeavesValidator, QualitiesContainer, NoValidator, \ + ChainedValidator, DefaultValidator, OnlyOneValidator, LeftValidator, NeighborValidator +from guessit.patterns import sep, build_or_pattern +from guessit.patterns.extension import subtitle_exts, video_exts, info_exts +from guessit.patterns.numeral import numeral, parse_numeral +from guessit.plugins.transformers import Transformer +from guessit.matcher import GuessFinder, found_property +import re -def guess_properties(string): - try: - prop, value, pos, end = find_properties(string)[0] - return { prop: value }, (pos, end) - except IndexError: - return None, None +class GuessProperties(Transformer): + def __init__(self): + Transformer.__init__(self, 35) + self.container = PropertiesContainer() + self.qualities = QualitiesContainer() -def process(mtree): - SingleNodeGuesser(guess_properties, 1.0, 
log).process(mtree) + def register_property(propname, props, **kwargs): + """props a dict of {value: [patterns]}""" + for canonical_form, patterns in props.items(): + if isinstance(patterns, tuple): + patterns2, pattern_kwarg = patterns + if kwargs: + current_kwarg = dict(kwargs) + current_kwarg.update(pattern_kwarg) + else: + current_kwarg = dict(pattern_kwarg) + current_kwarg['canonical_form'] = canonical_form + self.container.register_property(propname, *patterns2, **current_kwarg) + elif kwargs: + current_kwarg = dict(kwargs) + current_kwarg['canonical_form'] = canonical_form + self.container.register_property(propname, *patterns, **current_kwarg) + else: + self.container.register_property(propname, *patterns, canonical_form=canonical_form) + + def register_quality(propname, quality_dict): + """props a dict of {canonical_form: quality}""" + for canonical_form, quality in quality_dict.items(): + self.qualities.register_quality(propname, canonical_form, quality) + + register_property('container', {'mp4': ['MP4']}) + + # http://en.wikipedia.org/wiki/Pirated_movie_release_types + register_property('format', {'VHS': ['VHS', 'VHS-Rip'], + 'Cam': ['CAM', 'CAMRip', 'HD-CAM'], + #'Telesync': ['TELESYNC', 'PDVD'], + 'Telesync': (['TS', 'HD-TS'], {'confidence': 0.4}), + 'Workprint': ['WORKPRINT', 'WP'], + 'Telecine': ['TELECINE', 'TC'], + 'PPV': ['PPV', 'PPV-Rip'], # Pay Per View + 'TV': ['SD-TV', 'SD-TV-Rip', 'Rip-SD-TV', 'TV-Rip', 'Rip-TV'], + 'DVB': ['DVB-Rip', 'DVB', 'PD-TV'], + 'DVD': ['DVD', 'DVD-Rip', 'VIDEO-TS', 'DVD-R', 'DVD-9', 'DVD-5'], + 'HDTV': ['HD-TV', 'TV-RIP-HD', 'HD-TV-RIP'], + 'VOD': ['VOD', 'VOD-Rip'], + 'WEBRip': ['WEB-Rip'], + 'WEB-DL': ['WEB-DL', 'WEB-HD', 'WEB'], + 'HD-DVD': ['HD-(?:DVD)?-Rip', 'HD-DVD'], + 'BluRay': ['Blu-ray(?:-Rip)?', 'B[DR]', 'B[DR]-Rip', 'BD[59]', 'BD25', 'BD50'] + }) + + register_quality('format', {'VHS': -100, + 'Cam': -90, + 'Telesync': -80, + 'Workprint': -70, + 'Telecine': -60, + 'PPV': -50, + 'TV': -30, + 'DVB': -20, + 
'DVD': 0, + 'HDTV': 20, + 'VOD': 40, + 'WEBRip': 50, + 'WEB-DL': 60, + 'HD-DVD': 80, + 'BluRay': 100 + }) + + register_property('screenSize', {'360p': ['(?:\d{3,}(?:\\|\/|x|\*))?360(?:i|p?x?)'], + '368p': ['(?:\d{3,}(?:\\|\/|x|\*))?368(?:i|p?x?)'], + '480p': ['(?:\d{3,}(?:\\|\/|x|\*))?480(?:i|p?x?)'], + #'480p': (['hr'], {'confidence': 0.2}), # duplicate dict key + '576p': ['(?:\d{3,}(?:\\|\/|x|\*))?576(?:i|p?x?)'], + '720p': ['(?:\d{3,}(?:\\|\/|x|\*))?720(?:i|p?x?)'], + '900p': ['(?:\d{3,}(?:\\|\/|x|\*))?900(?:i|p?x?)'], + '1080i': ['(?:\d{3,}(?:\\|\/|x|\*))?1080i'], + '1080p': ['(?:\d{3,}(?:\\|\/|x|\*))?1080p?x?'], + '4K': ['(?:\d{3,}(?:\\|\/|x|\*))?2160(?:i|p?x?)'] + }, + validator=ChainedValidator(DefaultValidator(), OnlyOneValidator())) + + class ResolutionValidator(object): + """Make sure our match is surrounded by separators, or by another entry""" + def validate(self, prop, string, node, match, entry_start, entry_end): + """ + span = _get_span(prop, match) + span = _trim_span(span, string[span[0]:span[1]]) + start, end = span + + sep_start = start <= 0 or string[start - 1] in sep + sep_end = end >= len(string) or string[end] in sep + start_by_other = start in entry_end + end_by_other = end in entry_start + if (sep_start or start_by_other) and (sep_end or end_by_other): + return True + return False + """ + return True + + _digits_re = re.compile('\d+') + + def resolution_formatter(value): + digits = _digits_re.findall(value) + return 'x'.join(digits) + + self.container.register_property('screenSize', '\d{3,4}-?[x\*]-?\d{3,4}', canonical_from_pattern=False, formatter=resolution_formatter, validator=ChainedValidator(DefaultValidator(), ResolutionValidator())) + + register_quality('screenSize', {'360p': -300, + '368p': -200, + '480p': -100, + '576p': 0, + '720p': 100, + '900p': 130, + '1080i': 180, + '1080p': 200, + '4K': 400 + }) + + _videoCodecProperty = {'Real': ['Rv\d{2}'], # http://en.wikipedia.org/wiki/RealVideo + 'Mpeg2': ['Mpeg2'], + 'DivX': ['DVDivX', 
'DivX'], + 'XviD': ['XviD'], + 'h264': ['[hx]-264(?:-AVC)?', 'MPEG-4(?:-AVC)'], + 'h265': ['[hx]-265(?:-HEVC)?', 'HEVC'] + } + + register_property('videoCodec', _videoCodecProperty) + + register_quality('videoCodec', {'Real': -50, + 'Mpeg2': -30, + 'DivX': -10, + 'XviD': 0, + 'h264': 100, + 'h265': 150 + }) + + # http://blog.mediacoderhq.com/h264-profiles-and-levels/ + # http://fr.wikipedia.org/wiki/H.264 + self.container.register_property('videoProfile', 'BP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) + self.container.register_property('videoProfile', 'XP', 'EP', canonical_form='XP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) + self.container.register_property('videoProfile', 'MP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) + self.container.register_property('videoProfile', 'HP', 'HiP', canonical_form='HP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) + self.container.register_property('videoProfile', '10.?bit', 'Hi10P', canonical_form='10bit') + self.container.register_property('videoProfile', '8.?bit', canonical_form='8bit') + self.container.register_property('videoProfile', 'Hi422P', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) + self.container.register_property('videoProfile', 'Hi444PP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) + + register_quality('videoProfile', {'BP': -20, + 'XP': -10, + 'MP': 0, + 'HP': 10, + '10bit': 15, + 'Hi422P': 25, + 'Hi444PP': 35 + }) + + # has nothing to do here (or on filenames for that matter), but some + # releases use it and it helps to identify release groups, so we adapt + register_property('videoApi', {'DXVA': ['DXVA']}) + + register_property('audioCodec', {'MP3': ['MP3', 'LAME', 'LAME(?:\d)+-(?:\d)+'], + 'DolbyDigital': ['DD'], + 'AAC': ['AAC'], + 'AC3': ['AC3'], + 'Flac': ['FLAC'], + 'DTS': (['DTS'], {'validator': 
LeftValidator()}), + 'TrueHD': ['True-HD'] + }) + + register_quality('audioCodec', {'MP3': 10, + 'DolbyDigital': 30, + 'AAC': 35, + 'AC3': 40, + 'Flac': 45, + 'DTS': 60, + 'TrueHD': 70 + }) + + self.container.register_property('audioProfile', 'HD', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'DTS'])) + self.container.register_property('audioProfile', 'HD-MA', canonical_form='HDMA', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'DTS'])) + self.container.register_property('audioProfile', 'HE', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'AAC'])) + self.container.register_property('audioProfile', 'LC', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'AAC'])) + self.container.register_property('audioProfile', 'HQ', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'AC3'])) + + register_quality('audioProfile', {'HD': 20, + 'HDMA': 50, + 'LC': 0, + 'HQ': 0, + 'HE': 20 + }) + + register_property('audioChannels', {'7.1': ['7[\W_]1', '7ch', '8ch'], + '5.1': ['5[\W_]1', '5ch', '6ch'], + '2.0': ['2[\W_]0', '2ch', 'stereo'], + '1.0': ['1[\W_]0', '1ch', 'mono'] + }) + + register_quality('audioChannels', {'7.1': 200, + '5.1': 100, + '2.0': 0, + '1.0': -100 + }) + + self.container.register_property('episodeFormat', r'Minisodes?', canonical_form='Minisode') + + self.container.register_property('crc32', '(?:[a-fA-F]|[0-9]){8}', enhance=False, canonical_from_pattern=False) + + weak_episode_words = ['pt', 'part'] + self.container.register_property(None, '(' + build_or_pattern(weak_episode_words) + sep + '?(?P' + numeral + '))[^0-9]', enhance=False, canonical_from_pattern=False, confidence=0.4, formatter=parse_numeral) + + register_property('other', {'AudioFix': ['Audio-Fix', 'Audio-Fixed'], + 'SyncFix': ['Sync-Fix', 'Sync-Fixed'], + 'DualAudio': ['Dual-Audio'], + 'WideScreen': ['ws', 'wide-screen'], + 'Netflix': 
['Netflix', 'NF'] + }) + + self.container.register_property('other', 'Real', 'Fix', canonical_form='Proper', validator=NeighborValidator()) + self.container.register_property('other', 'Proper', 'Repack', 'Rerip', canonical_form='Proper') + self.container.register_property('other', 'Fansub', canonical_form='Fansub') + self.container.register_property('other', 'Fastsub', canonical_form='Fastsub') + self.container.register_property('other', '(?:Seasons?' + sep + '?)?Complete', canonical_form='Complete') + self.container.register_property('other', 'R5', 'RC', canonical_form='R5') + self.container.register_property('other', 'Pre-Air', 'Preair', canonical_form='Preair') + + self.container.register_canonical_properties('other', 'Screener', 'Remux', '3D', 'HD', 'mHD', 'HDLight', 'HQ', + 'DDC', + 'HR', 'PAL', 'SECAM', 'NTSC') + self.container.register_canonical_properties('other', 'Limited', 'Complete', 'Classic', 'Unrated', 'LiNE', 'Bonus', 'Trailer', validator=WeakValidator()) + + for prop in self.container.get_properties('format'): + self.container.register_property('other', prop.pattern + '(-?Scr(?:eener)?)', canonical_form='Screener') + + for exts in (subtitle_exts, info_exts, video_exts): + for container in exts: + self.container.register_property('container', container, confidence=0.3) + + def guess_properties(self, string, node=None, options=None): + found = self.container.find_properties(string, node, options) + return self.container.as_guess(found, string) + + def supported_properties(self): + return self.container.get_supported_properties() + + def process(self, mtree, options=None): + GuessFinder(self.guess_properties, 1.0, self.log, options).process_nodes(mtree.unidentified_leaves()) + proper_count = 0 + for other_leaf in mtree.leaves_containing('other'): + if 'other' in other_leaf.info and 'Proper' in other_leaf.info['other']: + proper_count += 1 + if proper_count: + found_property(mtree, 'properCount', proper_count) + + def rate_quality(self, guess, *props): 
+ return self.qualities.rate_quality(guess, *props) diff --git a/libs/guessit/transfo/guess_release_group.py b/libs/guessit/transfo/guess_release_group.py index b72c7368..646c7128 100644 --- a/libs/guessit/transfo/guess_release_group.py +++ b/libs/guessit/transfo/guess_release_group.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,69 +18,187 @@ # along with this program. If not, see . # -from __future__ import unicode_literals -from guessit.transfo import SingleNodeGuesser -from guessit.patterns import prop_multi, compute_canonical_form, _dash, _psep +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.plugins.transformers import Transformer +from guessit.matcher import GuessFinder, build_guess +from guessit.containers import PropertiesContainer +from guessit.patterns import sep +from guessit.guess import Guess +from guessit.textutils import strip_brackets import re -import logging - -log = logging.getLogger(__name__) - -def get_patterns(property_name): - return [ p.replace(_dash, _psep) for patterns in prop_multi[property_name].values() for p in patterns ] - -CODECS = get_patterns('videoCodec') -FORMATS = get_patterns('format') -VAPIS = get_patterns('videoApi') - -# RG names following a codec or format, with a potential space or dash inside the name -GROUP_NAMES = [ r'(?P' + codec + r')[ \.-](?P.+?([- \.].*?)??)[ \.]' - for codec in CODECS ] -GROUP_NAMES += [ r'(?P' + fmt + r')[ \.-](?P.+?([- \.].*?)??)[ \.]' - for fmt in FORMATS ] -GROUP_NAMES += [ r'(?P' + api + r')[ \.-](?P.+?([- \.].*?)??)[ \.]' - for api in VAPIS ] - -GROUP_NAMES2 = [ r'\.(?P' + codec + r')-(?P.*?)(-(.*?))?[ \.]' - for codec in CODECS ] -GROUP_NAMES2 += [ r'\.(?P' + fmt 
+ r')-(?P.*?)(-(.*?))?[ \.]' - for fmt in FORMATS ] -GROUP_NAMES2 += [ r'\.(?P' + vapi + r')-(?P.*?)(-(.*?))?[ \.]' - for vapi in VAPIS ] - -GROUP_NAMES = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES ] -GROUP_NAMES2 = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES2 ] - -def adjust_metadata(md): - return dict((property_name, compute_canonical_form(property_name, value) or value) - for property_name, value in md.items()) -def guess_release_group(string): - # first try to see whether we have both a known codec and a known release group - for rexp in GROUP_NAMES: - match = rexp.search(string) - while match: - metadata = match.groupdict() - # make sure this is an actual release group we caught - release_group = (compute_canonical_form('releaseGroup', metadata['releaseGroup']) or - compute_canonical_form('weakReleaseGroup', metadata['releaseGroup'])) - if release_group: - return adjust_metadata(metadata), (match.start(1), match.end(2)) +class GuessReleaseGroup(Transformer): + def __init__(self): + Transformer.__init__(self, -190) - # we didn't find anything conclusive, keep searching - match = rexp.search(string, match.span()[0]+1) + self.container = PropertiesContainer(canonical_from_pattern=False) + self._allowed_groupname_pattern = '[\w@#€£$&!\?]' + self._forbidden_groupname_lambda = [lambda elt: elt in ['rip', 'by', 'for', 'par', 'pour', 'bonus'], + lambda elt: self._is_number(elt)] + # If the previous property in this list, the match will be considered as safe + # and group name can contain a separator. 
+ self.previous_safe_properties = ['videoCodec', 'format', 'videoApi', 'audioCodec', 'audioProfile', 'videoProfile', 'audioChannels', 'other'] + self.previous_safe_values = {'other': ['Complete']} + self.next_safe_properties = ['extension', 'website'] + self.next_safe_values = {'format': ['Telesync']} + self.container.sep_replace_char = '-' + self.container.canonical_from_pattern = False + self.container.enhance = True + self.container.register_property('releaseGroup', self._allowed_groupname_pattern + '+') + self.container.register_property('releaseGroup', self._allowed_groupname_pattern + '+-' + self._allowed_groupname_pattern + '+') + self.re_sep = re.compile('(' + sep + ')') - # pick anything as releaseGroup as long as we have a codec in front - # this doesn't include a potential dash ('-') ending the release group - # eg: [...].X264-HiS@SiLUHD-English.[...] - for rexp in GROUP_NAMES2: - match = rexp.search(string) - if match: - return adjust_metadata(match.groupdict()), (match.start(1), match.end(2)) + def register_arguments(self, opts, naming_opts, output_opts, information_opts, webservice_opts, other_options): + naming_opts.add_argument('-G', '--expected-group', action='append', dest='expected_group', + help='Expected release group (can be used multiple times)') - return None, None + def supported_properties(self): + return self.container.get_supported_properties() + def _is_number(self, s): + try: + int(s) + return True + except ValueError: + return False -def process(mtree): - SingleNodeGuesser(guess_release_group, 0.8, log).process(mtree) + def validate_group_name(self, guess): + val = guess['releaseGroup'] + if len(val) > 1: + checked_val = "" + forbidden = False + for elt in self.re_sep.split(val): # separators are in the list because of capturing group + if forbidden: + # Previous was forbidden, don't had separator + forbidden = False + continue + for forbidden_lambda in self._forbidden_groupname_lambda: + forbidden = forbidden_lambda(elt.lower()) + if 
forbidden: + if checked_val: + # Removing previous separator + checked_val = checked_val[0:len(checked_val) - 1] + break + if not forbidden: + checked_val += elt + + val = checked_val + if not val: + return False + if self.re_sep.match(val[-1]): + val = val[:len(val)-1] + if self.re_sep.match(val[0]): + val = val[1:] + guess['releaseGroup'] = val + forbidden = False + for forbidden_lambda in self._forbidden_groupname_lambda: + forbidden = forbidden_lambda(val.lower()) + if forbidden: + break + if not forbidden: + return True + return False + + def is_leaf_previous(self, leaf, node): + if leaf.span[1] <= node.span[0]: + for idx in range(leaf.span[1], node.span[0]): + if leaf.root.value[idx] not in sep: + return False + return True + return False + + def validate_next_leaves(self, node): + if 'series' in node.root.info or 'title' in node.root.info: + # --expected-series or --expected-title is used. + return True + + # Make sure to avoid collision with 'series' or 'title' guessed later. Should be more precise. 
+ leaves = node.root.unidentified_leaves() + return len(list(leaves)) > 1 + + def validate_node(self, leaf, node, safe=False): + if not self.is_leaf_previous(leaf, node): + return False + if not self.validate_next_leaves(node): + return False + if safe: + for k, v in leaf.guess.items(): + if k in self.previous_safe_values and not v in self.previous_safe_values[k]: + return False + return True + + def guess_release_group(self, string, node=None, options=None): + if options and options.get('expected_group'): + expected_container = PropertiesContainer(enhance=True, canonical_from_pattern=False) + for expected_group in options.get('expected_group'): + if expected_group.startswith('re:'): + expected_group = expected_group[3:] + expected_group = expected_group.replace(' ', '-') + expected_container.register_property('releaseGroup', expected_group, enhance=True) + else: + expected_group = re.escape(expected_group) + expected_container.register_property('releaseGroup', expected_group, enhance=False) + + found = expected_container.find_properties(string, node, options, 'releaseGroup') + guess = expected_container.as_guess(found, string, self.validate_group_name) + if guess: + return guess + + found = self.container.find_properties(string, node, options, 'releaseGroup') + guess = self.container.as_guess(found, string, self.validate_group_name) + validated_guess = None + if guess: + group_node = node.group_node() + if group_node: + for leaf in group_node.leaves_containing(self.previous_safe_properties): + if self.validate_node(leaf, node, True): + if leaf.root.value[leaf.span[1]] == '-': + guess.metadata().confidence = 1 + else: + guess.metadata().confidence = 0.7 + validated_guess = guess + + if not validated_guess: + # If previous group last leaf is identified as a safe property, + # consider the raw value as a releaseGroup + previous_group_node = node.previous_group_node() + if previous_group_node: + for leaf in 
previous_group_node.leaves_containing(self.previous_safe_properties): + if self.validate_node(leaf, node, False): + guess = Guess({'releaseGroup': node.value}, confidence=1, input=node.value, span=(0, len(node.value))) + if self.validate_group_name(guess): + node.guess = guess + validated_guess = guess + + if validated_guess: + # If following group nodes have only one unidentified leaf, it belongs to the release group + next_group_node = node + + while True: + next_group_node = next_group_node.next_group_node() + if next_group_node: + leaves = list(next_group_node.leaves()) + if len(leaves) == 1 and not leaves[0].guess: + validated_guess['releaseGroup'] = validated_guess['releaseGroup'] + leaves[0].value + leaves[0].guess = validated_guess + else: + break + else: + break + + if not validated_guess and node.is_explicit() and node.node_last_idx == 0: # first node from group + validated_guess = build_guess(node, 'releaseGroup', value=node.value[1:len(node.value)-1]) + validated_guess.metadata().confidence = 0.4 + validated_guess.metadata().span = 1, len(node.value) + node.guess = validated_guess + + if validated_guess: + # Strip brackets + validated_guess['releaseGroup'] = strip_brackets(validated_guess['releaseGroup']) + + return validated_guess + + def process(self, mtree, options=None): + GuessFinder(self.guess_release_group, None, self.log, options).process_nodes(mtree.unidentified_leaves()) diff --git a/libs/guessit/transfo/guess_video_rexps.py b/libs/guessit/transfo/guess_video_rexps.py index 1b511f15..b1dca8ee 100644 --- a/libs/guessit/transfo/guess_video_rexps.py +++ b/libs/guessit/transfo/guess_video_rexps.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,33 +18,41 @@ # along with 
this program. If not, see . # -from __future__ import unicode_literals -from guessit import Guess -from guessit.transfo import SingleNodeGuesser -from guessit.patterns import video_rexps, sep -import re -import logging +from __future__ import absolute_import, division, print_function, \ + unicode_literals -log = logging.getLogger(__name__) +from guessit.patterns import _psep +from guessit.containers import PropertiesContainer +from guessit.plugins.transformers import Transformer +from guessit.matcher import GuessFinder +from guessit.patterns.numeral import parse_numeral -def guess_video_rexps(string): - string = '-' + string + '-' - for rexp, confidence, span_adjust in video_rexps: - match = re.search(sep + rexp + sep, string, re.IGNORECASE) - if match: - metadata = match.groupdict() - # is this the better place to put it? (maybe, as it is at least - # the soonest that we can catch it) - if metadata.get('cdNumberTotal', -1) is None: - del metadata['cdNumberTotal'] - span = (match.start() + span_adjust[0], - match.end() + span_adjust[1] - 2) - return (Guess(metadata, confidence=confidence, raw=string[span[0]:span[1]]), - span) +class GuessVideoRexps(Transformer): + def __init__(self): + Transformer.__init__(self, 25) - return None, None + self.container = PropertiesContainer(canonical_from_pattern=False) + self.container.register_property(None, 'cd' + _psep + '(?P[0-9])(?:' + _psep + 'of' + _psep + '(?P[0-9]))?', confidence=1.0, enhance=False, global_span=True, formatter=parse_numeral) + self.container.register_property('cdNumberTotal', '([1-9])' + _psep + 'cds?', confidence=0.9, enhance=False, formatter=parse_numeral) -def process(mtree): - SingleNodeGuesser(guess_video_rexps, None, log).process(mtree) + self.container.register_property('bonusNumber', 'x([0-9]{1,2})', enhance=False, global_span=True, formatter=parse_numeral) + + self.container.register_property('filmNumber', 'f([0-9]{1,2})', enhance=False, global_span=True, formatter=parse_numeral) + + 
self.container.register_property('edition', 'collector', 'collector-edition', 'edition-collector', canonical_form='Collector Edition') + self.container.register_property('edition', 'special-edition', 'edition-special', canonical_form='Special Edition') + self.container.register_property('edition', 'criterion', 'criterion-edition', 'edition-criterion', canonical_form='Criterion Edition') + self.container.register_property('edition', 'deluxe', 'cdeluxe-edition', 'edition-deluxe', canonical_form='Deluxe Edition') + self.container.register_property('edition', 'director\'?s?-cut', 'director\'?s?-cut-edition', 'edition-director\'?s?-cut', canonical_form='Director\'s cut') + + def supported_properties(self): + return self.container.get_supported_properties() + + def guess_video_rexps(self, string, node=None, options=None): + found = self.container.find_properties(string, node, options) + return self.container.as_guess(found, string) + + def process(self, mtree, options=None): + GuessFinder(self.guess_video_rexps, None, self.log, options).process_nodes(mtree.unidentified_leaves()) diff --git a/libs/guessit/transfo/guess_weak_episodes_rexps.py b/libs/guessit/transfo/guess_weak_episodes_rexps.py index 18306b43..93d7a7bb 100644 --- a/libs/guessit/transfo/guess_weak_episodes_rexps.py +++ b/libs/guessit/transfo/guess_weak_episodes_rexps.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,45 +18,64 @@ # along with this program. If not, see . 
# -from __future__ import unicode_literals -from guessit import Guess -from guessit.transfo import SingleNodeGuesser -from guessit.patterns import weak_episode_rexps +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.plugins.transformers import Transformer +from guessit.matcher import GuessFinder +from guessit.patterns import sep, build_or_pattern +from guessit.containers import PropertiesContainer, LeavesValidator, NoValidator, WeakValidator +from guessit.patterns.numeral import numeral, parse_numeral +from guessit.date import valid_year + import re -import logging - -log = logging.getLogger(__name__) -def guess_weak_episodes_rexps(string, node): - if 'episodeNumber' in node.root.info: - return None, None +class GuessWeakEpisodesRexps(Transformer): + def __init__(self): + Transformer.__init__(self, 15) - for rexp, span_adjust in weak_episode_rexps: - match = re.search(rexp, string, re.IGNORECASE) - if match: - metadata = match.groupdict() - span = (match.start() + span_adjust[0], - match.end() + span_adjust[1]) + of_separators = ['of', 'sur', '/', '\\'] + of_separators_re = re.compile(build_or_pattern(of_separators, escape=True), re.IGNORECASE) - epnum = int(metadata['episodeNumber']) - if epnum > 100: - season, epnum = epnum // 100, epnum % 100 - # episodes which have a season > 25 are most likely errors - # (Simpsons is at 23!) 
- if season > 25: - continue - return Guess({ 'season': season, - 'episodeNumber': epnum }, - confidence=0.6, raw=string[span[0]:span[1]]), span - else: - return Guess(metadata, confidence=0.3, raw=string[span[0]:span[1]]), span + self.container = PropertiesContainer(enhance=False, canonical_from_pattern=False) - return None, None + episode_words = ['episodes?'] + def _formater(episode_number): + epnum = parse_numeral(episode_number) + if not valid_year(epnum): + if epnum > 100: + season, epnum = epnum // 100, epnum % 100 + # episodes which have a season > 50 are most likely errors + # (Simpson is at 25!) + if season > 50: + return None + return {'season': season, 'episodeNumber': epnum} + else: + return epnum -guess_weak_episodes_rexps.use_node = True + self.container.register_property(['episodeNumber', 'season'], '[0-9]{2,4}', confidence=0.6, formatter=_formater, disabler=lambda options: options.get('episode_prefer_number') if options else False) + self.container.register_property(['episodeNumber', 'season'], '[0-9]{4}', confidence=0.6, formatter=_formater) + self.container.register_property('episodeNumber', '[^0-9](\d{1,3})', confidence=0.6, formatter=parse_numeral, disabler=lambda options: not options.get('episode_prefer_number') if options else True) + self.container.register_property(None, '(' + build_or_pattern(episode_words) + sep + '?(?P' + numeral + '))[^0-9]', confidence=0.4, formatter=parse_numeral) + self.container.register_property(None, r'(?P' + numeral + ')' + sep + '?' 
+ of_separators_re.pattern + sep + '?(?P' + numeral +')', confidence=0.6, formatter=parse_numeral) + self.container.register_property('episodeNumber', r'^' + sep + '?(\d{1,3})' + sep, confidence=0.4, formatter=parse_numeral, disabler=lambda options: not options.get('episode_prefer_number') if options else True) + self.container.register_property('episodeNumber', sep + r'(\d{1,3})' + sep + '?$', confidence=0.4, formatter=parse_numeral, disabler=lambda options: not options.get('episode_prefer_number') if options else True) + def supported_properties(self): + return self.container.get_supported_properties() -def process(mtree): - SingleNodeGuesser(guess_weak_episodes_rexps, 0.6, log).process(mtree) + def guess_weak_episodes_rexps(self, string, node=None, options=None): + if node and 'episodeNumber' in node.root.info: + return None + + properties = self.container.find_properties(string, node, options) + guess = self.container.as_guess(properties, string) + + return guess + + def should_process(self, mtree, options=None): + return mtree.guess.get('type', '').startswith('episode') + + def process(self, mtree, options=None): + GuessFinder(self.guess_weak_episodes_rexps, 0.6, self.log, options).process_nodes(mtree.unidentified_leaves()) diff --git a/libs/guessit/transfo/guess_website.py b/libs/guessit/transfo/guess_website.py index acfd8e11..aa33226b 100644 --- a/libs/guessit/transfo/guess_website.py +++ b/libs/guessit/transfo/guess_website.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack +# Copyright (c) 2013 Rémi Alvergnat # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,22 +18,39 @@ # along with this program. If not, see . 
# -from __future__ import unicode_literals -from guessit.transfo import SingleNodeGuesser -from guessit.patterns import websites -import logging +from __future__ import absolute_import, division, print_function, unicode_literals +from guessit.patterns import build_or_pattern +from guessit.containers import PropertiesContainer +from guessit.plugins.transformers import Transformer +from guessit.matcher import GuessFinder +from pkg_resources import resource_stream # @UnresolvedImport -log = logging.getLogger(__name__) +TLDS = [l.strip().decode('utf-8') + for l in resource_stream('guessit', 'tlds-alpha-by-domain.txt').readlines() + if b'--' not in l][1:] -def guess_website(string): - low = string.lower() - for site in websites: - pos = low.find(site.lower()) - if pos != -1: - return {'website': site}, (pos, pos + len(site)) - return None, None +class GuessWebsite(Transformer): + def __init__(self): + Transformer.__init__(self, 45) + self.container = PropertiesContainer(enhance=False, canonical_from_pattern=False) -def process(mtree): - SingleNodeGuesser(guess_website, 1.0, log).process(mtree) + tlds_pattern = build_or_pattern(TLDS) # All registered domain extension + safe_tlds_pattern = build_or_pattern(['com', 'org', 'net']) # For sure a website extension + safe_subdomains_pattern = build_or_pattern(['www']) # For sure a website subdomain + safe_prefix_tlds_pattern = build_or_pattern(['co', 'com', 'org', 'net']) # Those words before a tlds are sure + + self.container.register_property('website', '(?:' + safe_subdomains_pattern + '\.)+' + r'(?:[a-z-]+\.)+' + r'(?:' + tlds_pattern + r')+') + self.container.register_property('website', '(?:' + safe_subdomains_pattern + '\.)*' + r'[a-z-]+\.' + r'(?:' + safe_tlds_pattern + r')+') + self.container.register_property('website', '(?:' + safe_subdomains_pattern + '\.)*' + r'[a-z-]+\.' 
+ r'(?:' + safe_prefix_tlds_pattern + r'\.)+' + r'(?:' + tlds_pattern + r')+') + + def supported_properties(self): + return self.container.get_supported_properties() + + def guess_website(self, string, node=None, options=None): + found = self.container.find_properties(string, node, options, 'website') + return self.container.as_guess(found, string) + + def process(self, mtree, options=None): + GuessFinder(self.guess_website, 1.0, self.log, options).process_nodes(mtree.unidentified_leaves()) diff --git a/libs/guessit/transfo/guess_year.py b/libs/guessit/transfo/guess_year.py index c193af7a..61363da5 100644 --- a/libs/guessit/transfo/guess_year.py +++ b/libs/guessit/transfo/guess_year.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,33 +18,40 @@ # along with this program. If not, see . 
# -from __future__ import unicode_literals -from guessit.transfo import SingleNodeGuesser -from guessit.date import search_year -import logging +from __future__ import absolute_import, division, print_function, unicode_literals -log = logging.getLogger(__name__) +from guessit.plugins.transformers import Transformer +from guessit.matcher import GuessFinder +from guessit.date import search_year, valid_year -def guess_year(string): - year, span = search_year(string) - if year: - return { 'year': year }, span - else: - return None, None +class GuessYear(Transformer): + def __init__(self): + Transformer.__init__(self, -160) -def guess_year_skip_first(string): - year, span = search_year(string) - if year: - year2, span2 = guess_year(string[span[1]:]) - if year2: - return year2, (span2[0]+span[1], span2[1]+span[1]) + def supported_properties(self): + return ['year'] - return None, None + def guess_year(self, string, node=None, options=None): + year, span = search_year(string) + if year: + return {'year': year}, span + else: + return None, None + def second_pass_options(self, mtree, options=None): + year_nodes = list(mtree.leaves_containing('year')) + if len(year_nodes) > 1: + return {'skip_nodes': year_nodes[:len(year_nodes) - 1]} + return None -def process(mtree, skip_first_year=False): - if skip_first_year: - SingleNodeGuesser(guess_year_skip_first, 1.0, log).process(mtree) - else: - SingleNodeGuesser(guess_year, 1.0, log).process(mtree) + def process(self, mtree, options=None): + GuessFinder(self.guess_year, 1.0, self.log, options).process_nodes(mtree.unidentified_leaves()) + + # if we found a season number that is a valid year, it is usually safe to assume + # we can also set the year property to that value + for n in mtree.leaves_containing('season'): + g = n.guess + season = g['season'] + if valid_year(season): + g['year'] = season diff --git a/libs/guessit/transfo/post_process.py b/libs/guessit/transfo/post_process.py deleted file mode 100644 index 
5920e3a4..00000000 --- a/libs/guessit/transfo/post_process.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import unicode_literals -from guessit.patterns import subtitle_exts -from guessit.textutils import reorder_title, find_words -import logging - -log = logging.getLogger(__name__) - - -def process(mtree): - # 1- try to promote language to subtitle language where it makes sense - for node in mtree.nodes(): - if 'language' not in node.guess: - continue - - def promote_subtitle(): - # pylint: disable=W0631 - node.guess.set('subtitleLanguage', node.guess['language'], - confidence=node.guess.confidence('language')) - del node.guess['language'] - - # - if we matched a language in a file with a sub extension and that - # the group is the last group of the filename, it is probably the - # language of the subtitle - # (eg: 'xxx.english.srt') - if (mtree.node_at((-1,)).value.lower() in subtitle_exts and - node == mtree.leaves()[-2]): - promote_subtitle() - - # - if we find the word 'sub' before the language, and in the same explicit - # group, then upgrade the language - explicit_group = mtree.node_at(node.node_idx[:2]) - group_str = explicit_group.value.lower() - - if ('sub' in find_words(group_str) 
and - 0 <= group_str.find('sub') < (node.span[0] - explicit_group.span[0])): - promote_subtitle() - - # - if a language is in an explicit group just preceded by "st", - # it is a subtitle language (eg: '...st[fr-eng]...') - try: - idx = node.node_idx - previous = mtree.node_at((idx[0], idx[1] - 1)).leaves()[-1] - if previous.value.lower()[-2:] == 'st': - promote_subtitle() - except IndexError: - pass - - # 2- ", the" at the end of a series title should be prepended to it - for node in mtree.nodes(): - if 'series' not in node.guess: - continue - - node.guess['series'] = reorder_title(node.guess['series']) diff --git a/libs/guessit/transfo/split_explicit_groups.py b/libs/guessit/transfo/split_explicit_groups.py index 7ae5787d..67d54cfb 100644 --- a/libs/guessit/transfo/split_explicit_groups.py +++ b/libs/guessit/transfo/split_explicit_groups.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,27 +18,32 @@ # along with this program. If not, see . 
# -from __future__ import unicode_literals +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.plugins.transformers import Transformer from guessit.textutils import find_first_level_groups from guessit.patterns import group_delimiters -import functools -import logging - -log = logging.getLogger(__name__) +from functools import reduce -def process(mtree): - """return the string split into explicit groups, that is, those either - between parenthese, square brackets or curly braces, and those separated - by a dash.""" - for c in mtree.children: - groups = find_first_level_groups(c.value, group_delimiters[0]) - for delimiters in group_delimiters: - flatten = lambda l, x: l + find_first_level_groups(x, delimiters) - groups = functools.reduce(flatten, groups, []) +class SplitExplicitGroups(Transformer): + def __init__(self): + Transformer.__init__(self, 250) - # do not do this at this moment, it is not strong enough and can break other - # patterns, such as dates, etc... - #groups = functools.reduce(lambda l, x: l + x.split('-'), groups, []) + def process(self, mtree, options=None): + """split each of those into explicit groups (separated by parentheses or square brackets) - c.split_on_components(groups) + :return: return the string split into explicit groups, that is, those either + between parenthese, square brackets or curly braces, and those separated + by a dash.""" + for c in mtree.children: + groups = find_first_level_groups(c.value, group_delimiters[0]) + for delimiters in group_delimiters: + flatten = lambda l, x: l + find_first_level_groups(x, delimiters) + groups = reduce(flatten, groups, []) + + # do not do this at this moment, it is not strong enough and can break other + # patterns, such as dates, etc... 
+ # groups = functools.reduce(lambda l, x: l + x.split('-'), groups, []) + + c.split_on_components(groups) diff --git a/libs/guessit/transfo/split_on_dash.py b/libs/guessit/transfo/split_on_dash.py index 031baff6..e86c6a3f 100644 --- a/libs/guessit/transfo/split_on_dash.py +++ b/libs/guessit/transfo/split_on_dash.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,25 +18,30 @@ # along with this program. If not, see . # -from __future__ import unicode_literals +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.plugins.transformers import Transformer from guessit.patterns import sep import re -import logging - -log = logging.getLogger(__name__) -def process(mtree): - for node in mtree.unidentified_leaves(): - indices = [] +class SplitOnDash(Transformer): + def __init__(self): + Transformer.__init__(self, 245) - didx = 0 - pattern = re.compile(sep + '-' + sep) - match = pattern.search(node.value) - while match: - span = match.span() - indices.extend([ span[0], span[1] ]) - match = pattern.search(node.value, span[1]) + def process(self, mtree, options=None): + """split into '-' separated subgroups (with required separator chars + around the dash) + """ + for node in mtree.unidentified_leaves(): + indices = [] - if indices: - node.partition(indices) + pattern = re.compile(sep + '-' + sep) + match = pattern.search(node.value) + while match: + span = match.span() + indices.extend([span[0], span[1]]) + match = pattern.search(node.value, span[1]) + + if indices: + node.partition(indices) diff --git a/libs/guessit/transfo/split_path_components.py b/libs/guessit/transfo/split_path_components.py index 35fab405..c630a30c 100644 --- 
a/libs/guessit/transfo/split_path_components.py +++ b/libs/guessit/transfo/split_path_components.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2012 Nicolas Wack +# Copyright (c) 2013 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,19 +18,28 @@ # along with this program. If not, see . # -from __future__ import unicode_literals +from __future__ import absolute_import, division, print_function, unicode_literals + +from guessit.plugins.transformers import Transformer from guessit import fileutils -import os.path -import logging - -log = logging.getLogger(__name__) +from os.path import splitext -def process(mtree): - """Returns the filename split into [ dir*, basename, ext ].""" - components = fileutils.split_path(mtree.value) - basename = components.pop(-1) - components += list(os.path.splitext(basename)) - components[-1] = components[-1][1:] # remove the '.' from the extension +class SplitPathComponents(Transformer): + def __init__(self): + Transformer.__init__(self, 255) - mtree.split_on_components(components) + def process(self, mtree, options=None): + """first split our path into dirs + basename + ext + + :return: the filename split into [ dir*, basename, ext ] + """ + if not options.get('name_only'): + components = fileutils.split_path(mtree.value) + basename = components.pop(-1) + components += list(splitext(basename)) + components[-1] = components[-1][1:] # remove the '.' 
from the extension + + mtree.split_on_components(components) + else: + mtree.split_on_components([mtree.value, '']) diff --git a/libs/subliminal/providers/opensubtitles.py b/libs/subliminal/providers/opensubtitles.py index 4d2d14d0..795799d2 100644 --- a/libs/subliminal/providers/opensubtitles.py +++ b/libs/subliminal/providers/opensubtitles.py @@ -99,13 +99,15 @@ class OpenSubtitlesProvider(Provider): def no_operation(self): checked(self.server.NoOperation(self.token)) - def query(self, languages, hash=None, size=None, imdb_id=None, query=None): # @ReservedAssignment + def query(self, languages, hash=None, size=None, imdb_id=None, query=None, season=None, episode=None): # @ReservedAssignment searches = [] if hash and size: searches.append({'moviehash': hash, 'moviebytesize': str(size)}) if imdb_id: searches.append({'imdbid': imdb_id}) - if query: + if query and season and episode: + searches.append({'query': query, 'season': season, 'episode': episode}) + elif query: searches.append({'query': query}) if not searches: raise ValueError('One or more parameter missing') @@ -126,10 +128,16 @@ class OpenSubtitlesProvider(Provider): def list_subtitles(self, video, languages): query = None + season = None + episode = None if ('opensubtitles' not in video.hashes or not video.size) and not video.imdb_id: query = video.name.split(os.sep)[-1] + if isinstance(video, Episode): + query = video.series + season = video.season + episode = video.episode return self.query(languages, hash=video.hashes.get('opensubtitles'), size=video.size, imdb_id=video.imdb_id, - query=query) + query=query, season=season, episode=episode) def download_subtitle(self, subtitle): response = checked(self.server.DownloadSubtitles(self.token, [subtitle.id])) diff --git a/libs/subliminal/subtitle.py b/libs/subliminal/subtitle.py index d0786fc5..1ff7945d 100644 --- a/libs/subliminal/subtitle.py +++ b/libs/subliminal/subtitle.py @@ -51,7 +51,14 @@ class Subtitle(object): encodings.append('windows-1255') elif 
self.language.alpha3 == 'tur': encodings.extend(['iso-8859-9', 'windows-1254']) + elif self.language.alpha3 == 'pol': + # Eastern European Group 1 + encodings.extend(['windows-1250']) + elif self.language.alpha3 == 'bul': + # Eastern European Group 2 + encodings.extend(['windows-1251']) else: + # Western European (windows-1252) encodings.append('latin-1') # try to decode