import nltk
nltk.corpus.gutenberg.fileids()
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance("surprize")
Displaying 25 of 37 matches: er father , was sometimes taken by surprize at his being still able to pity ` hem do the other any good ." " You surprize me ! Emma must do Harriet good : a Knightley actually looked red with surprize and displeasure , as he stood up , r . Elton , and found to his great surprize , that Mr . Elton was actually on d aid ." Emma saw Mrs . Weston ' s surprize , and felt that it must be great , father was quite taken up with the surprize of so sudden a journey , and his f y , in all the favouring warmth of surprize and conjecture . She was , moreove he appeared , to have her share of surprize , introduction , and pleasure . Th ir plans ; and it was an agreeable surprize to her , therefore , to perceive t talking aunt had taken me quite by surprize , it must have been the death of m f all the dialogue which ensued of surprize , and inquiry , and congratulation the present . They might chuse to surprize her ." Mrs . Cole had many to agre the mode of it , the mystery , the surprize , is more like a young woman ' s s to her song took her agreeably by surprize -- a second , slightly but correct " " Oh ! no -- there is nothing to surprize one at all .-- A pretty fortune ; t to be considered . Emma ' s only surprize was that Jane Fairfax should accep of your admiration may take you by surprize some day or other ." Mr . Knightle ation for her will ever take me by surprize .-- I never had a thought of her i expected by the best judges , for surprize -- but there was great joy . Mr . sound of at first , without great surprize . " So unreasonably early !" she w d Frank Churchill , with a look of surprize and displeasure .-- " That is easy ; and Emma could imagine with what surprize and mortification she must be retu tled that Jane should go . Quite a surprize to me ! I had not the least idea ! . It is impossible to express our surprize . He came to speak to his father o g engaged !" Emma even jumped with surprize ;-- and , horror - struck , exclai
from nltk.corpus import brown
news_text = brown.words(categories='news')
fdist = nltk.FreqDist(w.lower() for w in news_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
print(m + ':', fdist[m], end=' ')
can: 94 could: 87 may: 93 might: 38 must: 53 will: 389
mystery_text = brown.words(categories = 'mystery')
fdist = nltk.FreqDist(w.lower() for w in mystery_text)
modals = ['what', 'when', 'where', 'who', 'why']
for m in modals:
print(m + ':', fdist[m], end = ' ')
what: 146 when: 154 where: 71 who: 94 why: 52
from nltk.corpus import udhr
udhr.fileids() #'Filipino_Tagalog-Latin1'
['Abkhaz-Cyrillic+Abkh', 'Abkhaz-UTF8', 'Achehnese-Latin1', 'Achuar-Shiwiar-Latin1', 'Adja-UTF8', 'Afaan_Oromo_Oromiffa-Latin1', 'Afrikaans-Latin1', 'Aguaruna-Latin1', 'Akuapem_Twi-UTF8', 'Albanian_Shqip-Latin1', 'Amahuaca', 'Amahuaca-Latin1', 'Amarakaeri-Latin1', 'Amuesha-Yanesha-UTF8', 'Arabela-Latin1', 'Arabic_Alarabia-Arabic', 'Asante-UTF8', 'Ashaninca-Latin1', 'Asheninca-Latin1', 'Asturian_Bable-Latin1', 'Aymara-Latin1', 'Balinese-Latin1', 'Bambara-UTF8', 'Baoule-UTF8', 'Basque_Euskara-Latin1', 'Batonu_Bariba-UTF8', 'Belorus_Belaruski-Cyrillic', 'Belorus_Belaruski-UTF8', 'Bemba-Latin1', 'Bengali-UTF8', 'Beti-UTF8', 'Bichelamar-Latin1', 'Bikol_Bicolano-Latin1', 'Bora-Latin1', 'Bosnian_Bosanski-Cyrillic', 'Bosnian_Bosanski-Latin2', 'Bosnian_Bosanski-UTF8', 'Breton-Latin1', 'Bugisnese-Latin1', 'Bulgarian_Balgarski-Cyrillic', 'Bulgarian_Balgarski-UTF8', 'Cakchiquel-Latin1', 'Campa_Pajonalino-Latin1', 'Candoshi-Shapra-Latin1', 'Caquinte-Latin1', 'Cashibo-Cacataibo-Latin1', 'Cashinahua-Latin1', 'Catalan-Latin1', 'Catalan_Catala-Latin1', 'Cebuano-Latin1', 'Chamorro-Latin1', 'Chayahuita-Latin1', 'Chechewa_Nyanja-Latin1', 'Chickasaw-Latin1', 'Chinanteco-Ajitlan-Latin1', 'Chinanteco-UTF8', 'Chinese_Mandarin-GB2312', 'Chuuk_Trukese-Latin1', 'Cokwe-Latin1', 'Corsican-Latin1', 'Croatian_Hrvatski-Latin2', 'Czech-Latin2', 'Czech-UTF8', 'Czech_Cesky-Latin2', 'Czech_Cesky-UTF8', 'Dagaare-UTF8', 'Dagbani-UTF8', 'Dangme-UTF8', 'Danish_Dansk-Latin1', 'Dendi-UTF8', 'Ditammari-UTF8', 'Dutch_Nederlands-Latin1', 'Edo-Latin1', 'English-Latin1', 'Esperanto-UTF8', 'Estonian_Eesti-Latin1', 'Ewe_Eve-UTF8', 'Fante-UTF8', 'Faroese-Latin1', 'Farsi_Persian-UTF8', 'Farsi_Persian-v2-UTF8', 'Fijian-Latin1', 'Filipino_Tagalog-Latin1', 'Finnish_Suomi-Latin1', 'Fon-UTF8', 'French_Francais-Latin1', 'Frisian-Latin1', 'Friulian_Friulano-Latin1', 'Ga-UTF8', 'Gagauz_Gagauzi-UTF8', 'Galician_Galego-Latin1', 'Garifuna_Garifuna-Latin1', 'German_Deutsch-Latin1', 'Gonja-UTF8', 'Greek_Ellinika-Greek', 'Greek_Ellinika-UTF8', 'Greenlandic_Inuktikut-Latin1', 'Guarani-Latin1', 'Guen_Mina-UTF8', 'HaitianCreole_Kreyol-Latin1', 'HaitianCreole_Popular-Latin1', 'Hani-Latin1', 'Hausa_Haoussa-Latin1', 'Hawaiian-UTF8', 'Hebrew_Ivrit-Hebrew', 'Hebrew_Ivrit-UTF8', 'Hiligaynon-Latin1', 'Hindi-UTF8', 'Hindi_web-UTF8', 'Hmong_Miao-Sichuan-Guizhou-Yunnan-Latin1', 'Hmong_Miao-SouthernEast-Guizhou-Latin1', 'Hmong_Miao_Northern-East-Guizhou-Latin1', 'Hrvatski_Croatian-Latin2', 'Huasteco-Latin1', 'Huitoto_Murui-Latin1', 'Hungarian_Magyar-Latin1', 'Hungarian_Magyar-Latin2', 'Hungarian_Magyar-UTF8', 'Ibibio_Efik-Latin1', 'Icelandic_Yslenska-Latin1', 'Ido-Latin1', 'Igbo-UTF8', 'Iloko_Ilocano-Latin1', 'Indonesian-Latin1', 'Interlingua-Latin1', 'Inuktikut_Greenlandic-Latin1', 'IrishGaelic_Gaeilge-Latin1', 'Italian-Latin1', 'Italian_Italiano-Latin1', 'Japanese_Nihongo-EUC', 'Japanese_Nihongo-SJIS', 'Japanese_Nihongo-UTF8', 'Javanese-Latin1', 'Jola-Fogny_Diola-UTF8', 'Kabye-UTF8', 'Kannada-UTF8', 'Kaonde-Latin1', 'Kapampangan-Latin1', 'Kasem-UTF8', 'Kazakh-Cyrillic', 'Kazakh-UTF8', 'Kiche_Quiche-Latin1', 'Kicongo-Latin1', 'Kimbundu_Mbundu-Latin1', 'Kinyamwezi_Nyamwezi-Latin1', 'Kinyarwanda-Latin1', 'Kituba-Latin1', 'Korean_Hankuko-UTF8', 'Kpelewo-UTF8', 'Krio-UTF8', 'Kurdish-UTF8', 'Lamnso_Lam-nso-UTF8', 'Latin_Latina-Latin1', 'Latin_Latina-v2-Latin1', 'Latvian-Latin1', 'Limba-UTF8', 'Lingala-Latin1', 'Lithuanian_Lietuviskai-Baltic', 'Lozi-Latin1', 'Luba-Kasai_Tshiluba-Latin1', 'Luganda_Ganda-Latin1', 'Lunda_Chokwe-lunda-Latin1', 'Luvale-Latin1', 'Luxembourgish_Letzebuergeusch-Latin1', 'Macedonian-UTF8', 'Madurese-Latin1', 'Makonde-Latin1', 'Malagasy-Latin1', 'Malay_BahasaMelayu-Latin1', 'Maltese-UTF8', 'Mam-Latin1', 'Maninka-UTF8', 'Maori-Latin1', 'Mapudungun_Mapuzgun-Latin1', 'Mapudungun_Mapuzgun-UTF8', 'Marshallese-Latin1', 'Matses-Latin1', 'Mayan_Yucateco-Latin1', 'Mazahua_Jnatrjo-UTF8', 'Mazateco-Latin1', 'Mende-UTF8', 'Mikmaq_Micmac-Mikmaq-Latin1', 'Minangkabau-Latin1', 'Miskito_Miskito-Latin1', 'Mixteco-Latin1', 'Mongolian_Khalkha-Cyrillic', 'Mongolian_Khalkha-UTF8', 'Moore_More-UTF8', 'Nahuatl-Latin1', 'Ndebele-Latin1', 'Nepali-UTF8', 'Ngangela_Nyemba-Latin1', 'NigerianPidginEnglish-Latin1', 'Nomatsiguenga-Latin1', 'NorthernSotho_Pedi-Sepedi-Latin1', 'Norwegian-Latin1', 'Norwegian_Norsk-Bokmal-Latin1', 'Norwegian_Norsk-Nynorsk-Latin1', 'Nyanja_Chechewa-Latin1', 'Nyanja_Chinyanja-Latin1', 'Nzema-UTF8', 'OccitanAuvergnat-Latin1', 'OccitanLanguedocien-Latin1', 'Oromiffa_AfaanOromo-Latin1', 'Osetin_Ossetian-UTF8', 'Oshiwambo_Ndonga-Latin1', 'Otomi_Nahnu-Latin1', 'Paez-Latin1', 'Palauan-Latin1', 'Peuhl-UTF8', 'Picard-Latin1', 'Pipil-Latin1', 'Polish-Latin2', 'Polish_Polski-Latin2', 'Ponapean-Latin1', 'Portuguese_Portugues-Latin1', 'Pulaar-UTF8', 'Punjabi_Panjabi-UTF8', 'Purhepecha-UTF8', 'Qechi_Kekchi-Latin1', 'Quechua-Latin1', 'Quichua-Latin1', 'Rarotongan_MaoriCookIslands-Latin1', 'Rhaeto-Romance_Rumantsch-Latin1', 'Romani-Latin1', 'Romani-UTF8', 'Romanian-Latin2', 'Romanian_Romana-Latin2', 'Rukonzo_Konjo-Latin1', 'Rundi_Kirundi-Latin1', 'Runyankore-rukiga_Nkore-kiga-Latin1', 'Russian-Cyrillic', 'Russian-UTF8', 'Russian_Russky-Cyrillic', 'Russian_Russky-UTF8', 'Sami_Lappish-UTF8', 'Sammarinese-Latin1', 'Samoan-Latin1', 'Sango_Sangho-Latin1', 'Sanskrit-UTF8', 'Saraiki-UTF8', 'Sardinian-Latin1', 'ScottishGaelic_GaidhligAlbanach-Latin1', 'Seereer-UTF8', 'Serbian_Srpski-Cyrillic', 'Serbian_Srpski-Latin2', 'Serbian_Srpski-UTF8', 'Sharanahua-Latin1', 'Shipibo-Conibo-Latin1', 'Shona-Latin1', 'Sinhala-UTF8', 'Siswati-Latin1', 'Slovak-Latin2', 'Slovak_Slovencina-Latin2', 'Slovenian_Slovenscina-Latin2', 'SolomonsPidgin_Pijin-Latin1', 'Somali-Latin1', 'Soninke_Soninkanxaane-UTF8', 'Sorbian-Latin2', 'SouthernSotho_Sotho-Sesotho-Sutu-Sesutu-Latin1', 'Spanish-Latin1', 'Spanish_Espanol-Latin1', 'Sukuma-Latin1', 'Sundanese-Latin1', 'Sussu_Soussou-Sosso-Soso-Susu-UTF8', 'Swaheli-Latin1', 'Swahili_Kiswahili-Latin1', 'Swedish_Svenska-Latin1', 'Tahitian-UTF8', 'Tenek_Huasteco-Latin1', 'Tetum-Latin1', 'Themne_Temne-UTF8', 'Tiv-Latin1', 'Toba-UTF8', 'Tojol-abal-Latin1', 'TokPisin-Latin1', 'Tonga-Latin1', 'Tongan_Tonga-Latin1', 'Totonaco-Latin1', 'Trukese_Chuuk-Latin1', 'Turkish_Turkce-Turkish', 'Turkish_Turkce-UTF8', 'Tzeltal-Latin1', 'Tzotzil-Latin1', 'Uighur_Uyghur-Latin1', 'Uighur_Uyghur-UTF8', 'Ukrainian-Cyrillic', 'Ukrainian-UTF8', 'Umbundu-Latin1', 'Urarina-Latin1', 'Uzbek-Latin1', 'Vietnamese-ALRN-UTF8', 'Vietnamese-UTF8', 'Vlach-Latin1', 'Walloon_Wallon-Latin1', 'Wama-UTF8', 'Waray-Latin1', 'Wayuu-Latin1', 'Welsh_Cymraeg-Latin1', 'WesternSotho_Tswana-Setswana-Latin1', 'Wolof-Latin1', 'Xhosa-Latin1', 'Yagua-Latin1', 'Yao-Latin1', 'Yapese-Latin1', 'Yoruba-UTF8', 'Zapoteco-Latin1', 'Zapoteco-SanLucasQuiavini-Latin1', 'Zhuang-Latin1', 'Zulu-Latin1']
raw_text = udhr.raw('Filipino_Tagalog-Latin1')
nltk.FreqDist(raw_text).plot()
<AxesSubplot:xlabel='Samples', ylabel='Counts'>
raw_text2 = udhr.raw('French_Francais-Latin1')
nltk.FreqDist(raw_text2).plot()
<AxesSubplot:xlabel='Samples', ylabel='Counts'>
phrase = ['It', 'feels', 'right', 'to', 'me']
phrase
['It', 'feels', 'right', 'to', 'me']
phrase = ['It '+'feels '+'right '+'to '+'me ']
phrase
['It feels right to me ']
phrase = 'It feels right to me'
phrase
'It feels right to me'
(phrase + ' ') * 2
'It feels right to me It feels right to me '
phrase = ['It', 'feels', 'right', 'to', 'me']
phrase[2:]
['right', 'to', 'me']
phrase + ['.']
['It', 'feels', 'right', 'to', 'me', '.']
' '.join(['Monty', 'Python'])
'Monty Python'
' '.join(['It', 'feels', 'right', 'to', 'me', '.'])
'It feels right to me .'
phrase.index('right')
2
phrase.index('right') * 2
4
len(nltk.corpus.gutenberg.words('austen-persuasion.txt')) #tokens
98171
persuasion = nltk.corpus.gutenberg.words('austen-persuasion.txt')
len(set(persuasion)) #word types
6132
from nltk.corpus import brown
brown.categories()
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
nltk.corpus.brown.words(categories = ['humor', 'mystery'])
['There', 'were', 'thirty-eight', 'patients', 'on', ...]
nltk.corpus.brown.words(categories = 'mystery')
['There', 'were', 'thirty-eight', 'patients', 'on', ...]
nltk.corpus.brown.words(categories = 'humor')
['It', 'was', 'among', 'these', 'that', 'Hinkle', ...]
from nltk.corpus import state_union
state_union.fileids()
['1945-Truman.txt', '1946-Truman.txt', '1947-Truman.txt', '1948-Truman.txt', '1949-Truman.txt', '1950-Truman.txt', '1951-Truman.txt', '1953-Eisenhower.txt', '1954-Eisenhower.txt', '1955-Eisenhower.txt', '1956-Eisenhower.txt', '1957-Eisenhower.txt', '1958-Eisenhower.txt', '1959-Eisenhower.txt', '1960-Eisenhower.txt', '1961-Kennedy.txt', '1962-Kennedy.txt', '1963-Johnson.txt', '1963-Kennedy.txt', '1964-Johnson.txt', '1965-Johnson-1.txt', '1965-Johnson-2.txt', '1966-Johnson.txt', '1967-Johnson.txt', '1968-Johnson.txt', '1969-Johnson.txt', '1970-Nixon.txt', '1971-Nixon.txt', '1972-Nixon.txt', '1973-Nixon.txt', '1974-Nixon.txt', '1975-Ford.txt', '1976-Ford.txt', '1977-Ford.txt', '1978-Carter.txt', '1979-Carter.txt', '1980-Carter.txt', '1981-Reagan.txt', '1982-Reagan.txt', '1983-Reagan.txt', '1984-Reagan.txt', '1985-Reagan.txt', '1986-Reagan.txt', '1987-Reagan.txt', '1988-Reagan.txt', '1989-Bush.txt', '1990-Bush.txt', '1991-Bush-1.txt', '1991-Bush-2.txt', '1992-Bush.txt', '1993-Clinton.txt', '1994-Clinton.txt', '1995-Clinton.txt', '1996-Clinton.txt', '1997-Clinton.txt', '1998-Clinton.txt', '1999-Clinton.txt', '2000-Clinton.txt', '2001-GWBush-1.txt', '2001-GWBush-2.txt', '2002-GWBush.txt', '2003-GWBush.txt', '2004-GWBush.txt', '2005-GWBush.txt', '2006-GWBush.txt']
state_union.count("men")
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) /tmp/ipykernel_9137/346438498.py in <module> ----> 1 state_union.count("men") AttributeError: 'PlaintextCorpusReader' object has no attribute 'count'
s_u = state_union.words()
fdist = nltk.FreqDist(w.lower() for w in s_u)
modals = ['men', 'women', 'people']
for m in modals:
print(m + ':', fdist[m], end=' ')
men: 232 women: 145 people: 1313
s_u.count("men")
228
[fileid[:4] for fileid in state_union.fileids()]
['1945', '1946', '1947', '1948', '1949', '1950', '1951', '1953', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1963', '1964', '1965', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2001', '2002', '2003', '2004', '2005', '2006']
cfd = nltk.ConditionalFreqDist(
(target, fileid[:4])
for fileid in state_union.fileids()
for w in state_union.words(fileid)
for target in ['women', 'men', 'people']
if w.lower().startswith(target))
cfd.plot()
<AxesSubplot:xlabel='Samples', ylabel='Counts'>
#use of word 'people' has increased
#use of word 'men' has progressively aligned on the use of word 'women'
from nltk.corpus import brown
day = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
genre_word = [(genre, day)
for genre in ['news', 'romance']
for day in brown.words(categories=genre)]
cfd.tabulate(conditions=['news', 'romance'],
samples=day, cumulative=True)
Monday Tuesday Wednesday Thursday Friday Saturday Sunday news 54 97 119 139 180 213 264 romance 2 5 8 9 12 16 21
cfd.plot(conditions=['news', 'romance'],
samples=day, cumulative=True)
<AxesSubplot:xlabel='Samples', ylabel='Cumulative Counts'>