import nltk
from nltk.book import*
*** Introductory Examples for the NLTK Book *** Loading text1, ..., text9 and sent1, ..., sent9 Type the name of the text or sentence to view it. Type: 'texts()' or 'sents()' to list the materials. text1: Moby Dick by Herman Melville 1851 text2: Sense and Sensibility by Jane Austen 1811 text3: The Book of Genesis text4: Inaugural Address Corpus text5: Chat Corpus text6: Monty Python and the Holy Grail text7: Wall Street Journal text8: Personals Corpus text9: The Man Who Was Thursday by G . K . Chesterton 1908
# Build a frequency distribution over every token of text1 (Moby Dick) and
# print its summary (number of distinct samples and total outcomes).
fdist1 = FreqDist(text1)
print(fdist1)
<FreqDist with 19317 samples and 260819 outcomes>
# Show the 50 most frequent tokens of text1 as (token, count) pairs.
fdist1.most_common(50)
[(',', 18713), ('the', 13721), ('.', 6862), ('of', 6536), ('and', 6024), ('a', 4569), ('to', 4542), (';', 4072), ('in', 3916), ('that', 2982), ("'", 2684), ('-', 2552), ('his', 2459), ('it', 2209), ('I', 2124), ('s', 1739), ('is', 1695), ('he', 1661), ('with', 1659), ('was', 1632), ('as', 1620), ('"', 1478), ('all', 1462), ('for', 1414), ('this', 1280), ('!', 1269), ('at', 1231), ('by', 1137), ('but', 1113), ('not', 1103), ('--', 1070), ('him', 1058), ('from', 1052), ('be', 1030), ('on', 1005), ('so', 918), ('whale', 906), ('one', 889), ('you', 841), ('had', 767), ('have', 760), ('there', 715), ('But', 705), ('or', 697), ('were', 680), ('now', 646), ('which', 640), ('?', 637), ('me', 627), ('like', 624)]
# Plot the cumulative counts of the 50 most frequent tokens.
fdist1.plot(50, cumulative=True)
<AxesSubplot:xlabel='Samples', ylabel='Cumulative Counts'>
# List the hapaxes of text1, i.e. the tokens that occur exactly once.
fdist1.hapaxes()
['Herman', 'Melville', ']', 'ETYMOLOGY', 'Late', 'Consumptive', 'School', 'threadbare', 'lexicons', 'mockingly', 'flags', 'mortality', 'signification', 'HACKLUYT', 'Sw', 'HVAL', 'roundness', 'Dut', 'Ger', 'WALLEN', 'WALW', 'IAN', 'RICHARDSON', 'KETOS', 'GREEK', 'CETUS', 'LATIN', 'WHOEL', 'ANGLO', 'SAXON', 'WAL', 'HWAL', 'SWEDISH', 'ICELANDIC', 'BALEINE', 'BALLENA', 'FEGEE', 'ERROMANGOAN', 'Librarian', 'painstaking', 'burrower', 'grub', 'Vaticans', 'stalls', 'higgledy', 'piggledy', 'gospel', 'promiscuously', 'commentator', 'belongest', 'sallow', 'Pale', 'Sherry', 'loves', 'bluntly', 'Subs', 'thankless', 'Hampton', 'Court', 'hie', 'refugees', 'pampered', 'Michael', 'Raphael', 'unsplinterable', 'GENESIS', 'JOB', 'JONAH', 'punish', 'ISAIAH', 'soever', 'cometh', 'incontinently', 'perisheth', 'PLUTARCH', 'MORALS', 'breedeth', 'Whirlpooles', 'Balaene', 'arpens', 'PLINY', 'Scarcely', 'TOOKE', 'LUCIAN', 'TRUE', 'catched', 'OCTHER', 'VERBAL', 'TAKEN', 'MOUTH', 'ALFRED', '890', 'gudgeon', 'retires', 'MONTAIGNE', 'APOLOGY', 'RAIMOND', 'SEBOND', 'Nick', 'RABELAIS', 'cartloads', 'STOWE', 'ANNALS', 'LORD', 'BACON', 'Touching', 'ork', 'DEATH', 'sovereignest', 'bruise', 'HAMLET', 'leach', 'Mote', 'availle', 'returne', 'againe', 'worker', 'Dinting', 'paine', 'thro', 'maine', 'FAERIE', 'Immense', 'til', 'DAVENANT', 'PREFACE', 'GONDIBERT', 'spermacetti', 'Hosmannus', 'Nescio', 'VIDE', 'Spencer', 'Talus', 'flail', 'threatens', 'jav', 'lins', 'WALLER', 'SUMMER', 'ISLANDS', 'Commonwealth', 'Civitas', 'OPENING', 'SENTENCE', 'HOBBES', 'LEVIATHAN', 'Silly', 'Mansoul', 'chewing', 'sprat', 'PILGRIM', 'PROGRESS', 'Created', 'PARADISE', 'LOST', '---"', 'Hugest', 'Stretched', 'Draws', 'FULLLER', 'PROFANE', 'HOLY', 'STATE', 'DRYDEN', 'ANNUS', 'MIRABILIS', 'aground', 'EDGE', 'TEN', 'SPITZBERGEN', 'PURCHAS', 'wantonness', 'fuzzing', 'vents', 'HERBERT', 'INTO', 'ASIA', 'AFRICA', 'SCHOUTEN', 'SIXTH', 'CIRCUMNAVIGATION', 'Elbe', 'ducat', 'herrings', 'GREENLAND', 'Several', 'Fife', 'Anno', '1652', 
'Pitferren', 'SIBBALD', 'FIFE', 'KINROSS', 'Myself', 'Sperma', 'ceti', 'fierceness', 'RICHARD', 'STRAFFORD', 'LETTER', 'BERMUDAS', 'PHIL', 'TRANS', '1668', 'PRIMER', 'COWLEY', '1729', '"...', 'frequendy', 'insupportable', 'disorder', 'ULLOA', 'SOUTH', 'AMERICA', 'sylphs', 'petticoat', 'Oft', 'Tho', 'RAPE', 'LOCK', 'NAT', 'wales', 'JOHNSON', 'COOK', 'dung', 'lime', 'juniper', 'UNO', 'VON', 'TROIL', 'LETTERS', 'BANKS', 'SOLANDER', '1772', 'Nantuckois', 'JEFFERSON', 'MEMORIAL', 'MINISTER', 'REFERENCE', 'PARLIAMENT', 'SOMEWHERE', 'guarding', 'protecting', 'robbers', 'BLACKSTONE', 'Rodmond', 'suspends', 'attends', 'FALCONER', 'Bright', 'roofs', 'domes', 'rockets', 'Around', 'unwieldy', 'COWPER', 'VISIT', 'LONDON', 'HUNTER', 'DISSECTION', 'SMALL', 'SIZED', 'aorta', 'gushing', 'PALEY', 'THEOLOGY', 'mammiferous', 'hind', 'BARON', 'CUVIER', 'COLNETT', 'PURPOSE', 'EXTENDING', 'SPERMACETI', 'Floundered', 'chace', 'peopling', 'Gather', 'Led', 'instincts', 'trackless', 'Assaulted', 'voracious', 'spiral', 'MONTGOMERY', 'WORLD', 'FLOOD', 'Paean', 'fatter', 'Flounders', 'CHARLES', 'LAMB', 'TRIUMPH', '1690', 'OBED', 'Susan', 'HAWTHORNE', 'TWICE', 'bespeak', 'raal', 'COOPER', 'PILOT', 'Berlin', 'Gazette', 'ECKERMANN', 'CONVERSATIONS', 'GOETHE', 'ESSEX', 'WAS', 'ATTACKED', 'FINALLY', 'DESTROYED', 'OWEN', 'CHACE', 'FIRST', 'SAID', 'VESSEL', 'YORK', '1821', 'piping', 'dimmed', 'phospher', 'ELIZABETH', 'OAKES', 'SMITH', 'amounted', '440', 'SCORESBY', 'Mad', 'agonies', 'endures', 'infuriated', 'rears', 'snaps', 'propelled', 'observers', 'opportunities', 'habitudes', 'BEALE', 'offensively', 'artful', 'mischievous', 'FREDERICK', 'DEBELL', '1840', 'October', 'Raise', 'ay', 'THAR', 'bowes', 'os', 'ROSS', 'ETCHINGS', 'CRUIZE', '1846', 'Globe', 'transactions', 'relate', 'HUSSEY', 'SURVIVORS', 'parried', 'MISSIONARY', 'JOURNAL', 'TYERMAN', 'boldest', 'persevering', 'REPORT', 'DANIEL', 'SPEECH', 'SENATE', 'APPLICATION', 'ERECTION', 'BREAKWATER', 'CAPTORS', 'WHALEMAN', 'ADVENTURES', 'BIOGRAPHY', 
'GATHERED', 'HOMEWARD', 'COMMODORE', 'PREBLE', 'REV', 'CHEEVER', 'MUTINEER', 'BROTHER', 'ANOTHER', 'MCCULLOCH', 'COMMERCIAL', 'reciprocal', 'clews', 'SOMETHING', 'UNPUBLISHED', 'CURRENTS', 'Pedestrians', 'recollect', 'gateways', 'VOYAGER', 'ARCTIC', 'NEWSPAPER', 'TAKING', 'RETAKING', 'HOBOMACK', 'MIRIAM', 'FISHERMAN', 'appliance', 'RIBS', 'TRUCKS', 'Terra', 'Del', 'Fuego', 'DARWIN', 'NATURALIST', ";--'", '!\'"', 'WHARTON', 'Loomings', 'spleen', 'regulating', 'circulation', 'Whenever', 'drizzly', 'hypos', 'philosophical', 'Cato', 'Manhattoes', 'reefs', 'downtown', 'gazers', 'Circumambulate', 'Corlears', 'Coenties', 'Slip', 'Whitehall', 'Posted', 'sentinels', 'spiles', 'pier', 'lath', 'counters', 'desks', 'loitering', 'shady', 'Inlanders', 'lanes', 'alleys', 'attract', 'dale', 'dreamiest', 'shadiest', 'quietest', 'enchanting', 'Saco', 'crucifix', 'Deep', 'mazy', 'Tiger', 'Tennessee', 'Rockaway', 'Persians', 'deity', 'Narcissus', 'ungraspable', 'hazy', 'quarrelsome', 'offices', 'abominate', 'toils', 'trials', 'barques', 'schooners', 'broiling', 'buttered', 'judgmatically', 'peppered', 'reverentially', 'idolatrous', 'dotings', 'ibis', 'roasted', 'bake', 'plumb', 'Van', 'Rensselaers', 'Randolphs', 'Hardicanutes', 'lording', 'tallest', 'decoction', 'Seneca', 'Stoics', 'Testament', 'promptly', 'rub', 'infliction', 'BEING', 'PAID', 'urbane', 'ills', 'monied', 'consign', 'prevalent', 'violate', 'Pythagorean', 'commonalty', 'police', 'surveillance', 'programme', 'solo', 'CONTESTED', 'ELECTION', 'PRESIDENCY', 'UNITED', 'STATES', 'ISHMAEL', 'BLOODY', 'AFFGHANISTAN', 'managers', 'genteel', 'comedies', 'farces', 'cunningly', 'disguises', 'cajoling', 'unbiased', 'freewill', 'discriminating', 'overwhelming', 'undeliverable', 'itch', 'forbidden', 'ignoring', 'lodges', 'Carpet', 'Bag', 'Manhatto', 'candidates', 'penalties', 'Tyre', 'Carthage', 'imported', 'cobblestones', 'bitingly', 'shouldering', 'price', 'fervent', 'asphaltic', 'pavement', 'flinty', 'projections', 'soles', 'Too', 
'cheapest', 'cheeriest', 'invitingly', 'particles', 'peer', 'Angel', 'Doom', 'wailing', 'gnashing', 'Wretched', 'entertainment', 'Moving', 'emigrant', 'poverty', 'creak', 'lodgings', 'zephyr', 'hob', 'toasting', 'observest', 'sashless', 'glazier', 'reasonest', 'chinks', 'crannies', 'lint', 'chattering', 'shiverings', 'cob', 'redder', 'Orion', 'glitters', 'conservatories', 'president', 'temperance', 'blubbering', 'straggling', 'wainscots', 'reminding', 'oilpainting', 'besmoked', 'defaced', 'unequal', 'crosslights', 'hags', 'delineate', 'bewitched', 'ponderings', 'boggy', 'soggy', 'squitchy', 'froze', 'heath', 'icebound', 'represents', 'Horner', 'foundered', 'clubs', 'harvesting', 'hacking', 'horrifying', 'Mixed', 'Nathan', 'Swain', 'corkscrew', 'Blanco', 'sojourning', 'fireplaces', 'duskier', 'cockpits', 'rarities', 'Projecting', 'Within', 'shelves', 'flasks', 'bustles', 'deliriums', 'Abominable', 'tumblers', 'cylinders', 'goggling', 'deceitfully', 'tapered', 'Parallel', 'pecked', 'footpads', 'Fill', 'shilling', 'examining', 'SKRIMSHANDER', 'accommodated', 'unoccupied', 'haint', 'pose', 'whalin', 'decidedly', 'objectionable', 'wander', 'Battery', 'ruminating', 'adorning', 'potatoes', 'sartainty', 'diabolically', 'steaks', 'undress', 'looker', 'rioting', 'Grampus', 'seed', 'Feegees', 'tramping', 'Enveloped', 'bedarned', 'eruption', 'officiating', 'brimmers', 'complained', 'potion', 'colds', 'catarrhs', 'liquor', 'arrantest', 'topers', 'obstreperously', 'aloof', 'desirous', 'hilarity', 'coffer', 'Southerner', 'mountaineers', 'Alleghanian', 'missed', 'supernaturally', 'congratulate', 'multiply', 'bachelor', 'abominated', 'tidiest', 'bedwards', 'shan', 'tablecloth', 'Skrimshander', 'bump', 'spraining', 'eider', 'yoking', 'rickety', 'whirlwinds', 'knockings', 'dismissed', 'popped', 'cherishing', 'chuckled', 'chuckle', 'mightily', 'catches', 'bamboozingly', 'overstocked', 'toothpick', 'rayther', 'BROWN', 'slanderin', 'farrago', 'BROKE', 'Sartain', 'Mt', 'Hecla', 
'persist', 'mystifying', 'unsay', 'criminal', 'Wall', 'purty', 'sarmon', 'rips', 'tellin', 'bought', 'balmed', 'curios', 'sellin', 'inions', 'fooling', 'idolators', 'Depend', 'reg', 'lar', 'spliced', 'Johnny', 'sprawling', 'Arter', 'glim', 'jiffy', 'irresolute', 'vum', 'WON', 'Folding', 'scrutiny', 'porcupine', 'moccasin', 'ponchos', 'parade', 'rainy', 'remembering', 'commended', 'cobs', 'Nod', 'footfall', 'unlacing', 'blackish', 'plasters', 'inkling', 'Placing', 'crammed', 'scalp', 'mildewed', 'Ignorance', 'parent', 'nonplussed', 'undressing', 'checkered', 'Thirty', 'frogs', 'quaked', 'wrapall', 'dreadnaught', 'fumbled', 'Remembering', 'manikin', 'tenpin', 'andirons', 'jambs', 'bricks', 'appropriate', 'applying', 'hastier', 'withdrawals', 'antics', 'devotee', 'extinguishing', 'unceremoniously', 'bagged', 'sportsman', 'woodcock', 'uncomfortableness', 'deliberating', 'puffed', 'sang', 'Stammering', 'conjured', 'responses', 'debel', 'flourishing', 'Angels', 'flourishings', 'peddlin', 'sleepe', 'grunted', 'gettee', 'motioning', 'comely', 'insured', 'Counterpane', 'parti', 'triangles', 'interminable', 'caper', 'supperless', '21st', 'hemisphere', 'sigh', 'Sixteen', 'ached', 'coaches', 'stockinged', 'slippering', 'misbehaviour', 'unendurable', 'stepmothers', 'misfortunes', 'steeped', 'shudderingly', 'confounding', 'soberly', 'recurred', 'predicament', 'unlock', 'bridegroom', 'clasp', 'hugged', 'rouse', 'snore', 'scratch', 'Throwing', 'expostulations', 'unbecomingness', 'matrimonial', 'dawning', 'overture', 'innate', 'compliment', 'civility', 'rudeness', 'toilette', 'dressing', 'donning', 'gaspings', 'booting', 'caterpillar', 'outlandishness', 'manners', 'education', 'undergraduate', 'dreamt', 'cowhide', 'pinched', 'curtains', 'indecorous', 'contented', 'restricting', 'donned', 'lathering', 'unsheathes', 'whets', 'Rogers', 'cutlery', 'Afterwards', 'baton', 'Breakfast', 'pleasantly', 'bountifully', 'laughable', 'bosky', 'unshorn', 'gowns', 'toasted', 'lingers', 'tarried', 
'barred', 'Grub', 'Park', 'assurance', 'polish', 'occasioned', 'embarrassed', 'bashfulness', 'duelled', 'winking', 'tastes', 'sheepishly', 'bashful', 'icicle', 'admirer', 'cordially', 'grappling', 'genteelly', 'eschewed', 'undivided', '6', 'circulating', 'nondescripts', 'Chestnut', 'jostle', 'Regent', 'Lascars', 'Bombay', 'Apollo', 'Feegeeans', 'Tongatobooarrs', 'Erromanggoans', 'Pannangians', 'Brighggians', 'weekly', 'Vermonters', 'stalwart', 'frames', 'felled', 'strutting', 'wester', 'bombazine', 'cloak', 'mow', 'gloves', 'joins', 'outfit', 'waistcoats', 'Hay', 'Seed', 'tract', 'dearest', 'pave', 'eggs', 'patrician', 'parks', 'scraggy', 'scoria', 'Herr', 'dowers', 'nieces', 'reservoirs', 'maples', 'bountiful', 'proffer', 'passer', 'cones', 'blossoms', 'superinduced', 'carnation', 'Salem', 'sweethearts', 'Puritanic', 'Whaleman', 'Wrapping', 'Each', 'quote', 'TALBOT', 'Near', 'Desolation', '1st', 'SISTER', 'ROBERT', 'WILLIS', 'ELLERY', 'NATHAN', 'COLEMAN', 'WALTER', 'CANNY', 'SETH', 'GLEIG', 'Forming', 'ELIZA', '31st', 'MARBLE', 'SHIPMATES', 'EZEKIEL', 'HARDY', 'AUGUST', '3d', '1833', 'WIDOW', 'Shaking', 'glazed', 'Affected', 'relatives', 'unhealing', 'sympathetically', 'wounds', 'bleed', 'blanks', ...]
# V is the vocabulary of text1: its distinct tokens.
V = set(text1)
# Keep only the vocabulary items longer than 15 characters.
long_words = [w for w in V if len(w) > 15]
# Display them in sorted order (uppercase sorts before lowercase).
sorted(long_words)
['CIRCUMNAVIGATION', 'Physiognomically', 'apprehensiveness', 'cannibalistically', 'characteristically', 'circumnavigating', 'circumnavigation', 'circumnavigations', 'comprehensiveness', 'hermaphroditical', 'indiscriminately', 'indispensableness', 'irresistibleness', 'physiognomically', 'preternaturalness', 'responsibilities', 'simultaneousness', 'subterraneousness', 'supernaturalness', 'superstitiousness', 'uncomfortableness', 'uncompromisedness', 'undiscriminating', 'uninterpenetratingly']
# Frequency distribution over the tokens of text5 (the Chat Corpus).
fdist5 = FreqDist(text5)
sorted(w for w in set(text5) if len(w) > 7 and fdist5[w] > 7)
# Q: Why is fdist5 followed by [] and not ()?
# A: fdist5[w] is a dictionary-style lookup that returns the count stored for
#    the token w; () would instead try to call fdist5 as a function.
#    So the filter keeps tokens longer than 7 characters that occur more than 7 times.
# This code could be a breakthrough for the problem Al and I have been struggling with.
['#14-19teens', '#talkcity_adults', '((((((((((', '........', 'Question', 'actually', 'anything', 'computer', 'cute.-ass', 'everyone', 'football', 'innocent', 'listening', 'remember', 'seriously', 'something', 'together', 'tomorrow', 'watching']
# bigrams() yields each pair of adjacent words; list() materialises them for display.
list(bigrams(['more', 'is', 'said', 'than', 'done']))
[('more', 'is'), ('is', 'said'), ('said', 'than'), ('than', 'done')]
# Print the collocations found in text4 (Inaugural Address Corpus).
text4.collocations()
United States; fellow citizens; four years; years ago; Federal Government; General Government; American people; Vice President; God bless; Chief Justice; Old World; Almighty God; Fellow citizens; Chief Magistrate; every citizen; one another; fellow Americans; Indian tribes; public debt; foreign nations
# Print the collocations found in text5 (Chat Corpus).
text5.collocations()
wanna chat; PART JOIN; MODE #14-19teens; JOIN PART; PART PART; cute.-ass MP3; MP3 player; JOIN JOIN; times .. .; ACTION watches; guys wanna; song lasts; last night; ACTION sits; -...)...- S.M.R.; Lime Player; Player 12%; dont know; lez gurls; long time
# Print the collocations found in text2 (Sense and Sensibility).
text2.collocations()
Colonel Brandon; Sir John; Lady Middleton; Miss Dashwood; every thing; thousand pounds; dare say; Miss Steeles; said Elinor; Miss Steele; every body; John Dashwood; great deal; Harley Street; Berkeley Street; Miss Dashwoods; young man; Combe Magna; every day; next morning
# Print the collocations found in text8 (Personals Corpus).
text8.collocations()
would like; medium build; social drinker; quiet nights; non smoker; long term; age open; Would like; easy going; financially secure; fun times; similar interests; Age open; weekends away; poss rship; well presented; never married; single mum; permanent relationship; slim build
# Length of every token of text1, in text order.
list(map(len, text1))
[1, 4, 4, 2, 6, 8, 4, 1, 9, 1, 1, 8, 2, 1, 4, 11, 5, 2, 1, 7, 6, 1, 3, 4, 5, 2, 10, 2, 4, 1, 5, 1, 4, 1, 3, 5, 1, 1, 3, 3, 3, 1, 2, 3, 4, 7, 3, 3, 8, 3, 8, 1, 4, 1, 5, 12, 1, 9, 11, 4, 3, 3, 3, 5, 2, 3, 3, 5, 7, 2, 3, 5, 1, 2, 5, 2, 4, 3, 3, 8, 1, 2, 7, 6, 8, 3, 2, 3, 9, 1, 1, 5, 3, 4, 2, 4, 2, 6, 6, 1, 3, 2, 5, 4, 2, 4, 4, 1, 5, 1, 4, 2, 2, 2, 6, 2, 3, 6, 7, 3, 1, 7, 9, 1, 3, 6, 1, 1, 5, 6, 5, 6, 3, 13, 2, 3, 4, 1, 3, 7, 4, 5, 2, 3, 4, 2, 2, 8, 1, 5, 1, 3, 2, 1, 3, 3, 1, 4, 1, 4, 6, 2, 5, 4, 9, 2, 7, 1, 3, 2, 3, 1, 5, 2, 6, 2, 7, 2, 2, 7, 1, 1, 10, 1, 5, 1, 3, 2, 2, 4, 11, 4, 3, 3, 1, 3, 3, 1, 6, 1, 1, 1, 1, 1, 4, 1, 3, 1, 2, 4, 1, 2, 6, 2, 2, 10, 1, 1, 10, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 6, 1, 3, 1, 5, 1, 4, 1, 7, 1, 5, 1, 9, 1, 5, 1, 7, 1, 7, 1, 6, 1, 7, 1, 7, 1, 5, 1, 4, 1, 4, 1, 5, 1, 5, 1, 4, 1, 4, 1, 11, 1, 8, 1, 8, 2, 1, 3, 1, 3, 1, 9, 2, 2, 4, 2, 4, 4, 4, 4, 11, 8, 3, 4, 1, 4, 2, 1, 4, 5, 2, 1, 3, 1, 3, 7, 2, 4, 4, 7, 3, 4, 8, 3, 6, 1, 6, 2, 3, 5, 1, 7, 2, 8, 6, 9, 2, 6, 2, 5, 7, 4, 2, 3, 4, 10, 1, 6, 2, 7, 1, 9, 3, 4, 3, 1, 2, 5, 4, 2, 5, 1, 4, 3, 8, 1, 8, 5, 10, 1, 7, 9, 1, 2, 5, 8, 1, 3, 9, 6, 8, 1, 3, 4, 2, 1, 2, 8, 3, 7, 7, 9, 1, 2, 4, 2, 3, 5, 4, 9, 1, 5, 8, 3, 6, 8, 2, 12, 1, 2, 9, 1, 8, 4, 1, 1, 3, 4, 2, 4, 3, 4, 13, 4, 1, 7, 1, 7, 1, 3, 4, 2, 9, 1, 2, 4, 7, 3, 11, 1, 9, 3, 3, 1, 2, 4, 4, 4, 1, 4, 5, 2, 1, 3, 1, 3, 1, 5, 11, 1, 2, 1, 4, 9, 2, 4, 8, 1, 6, 5, 5, 2, 4, 2, 4, 5, 4, 4, 4, 1, 3, 3, 4, 4, 4, 6, 5, 2, 3, 4, 1, 6, 1, 3, 4, 4, 3, 9, 5, 2, 3, 1, 3, 4, 4, 1, 8, 1, 3, 1, 3, 4, 9, 4, 5, 1, 3, 3, 2, 4, 7, 1, 4, 4, 4, 3, 5, 7, 1, 3, 2, 3, 10, 10, 7, 2, 4, 2, 2, 1, 3, 1, 4, 1, 3, 2, 3, 4, 3, 4, 5, 2, 4, 2, 6, 3, 5, 1, 2, 2, 4, 3, 4, 5, 2, 3, 4, 2, 9, 1, 5, 4, 1, 5, 5, 3, 7, 5, 3, 3, 9, 3, 2, 1, 3, 4, 4, 4, 5, 3, 3, 5, 2, 3, 5, 1, 4, 4, 4, 6, 1, 3, 4, 7, 3, 4, 4, 6, 3, 8, 3, 3, 5, 1, 7, 7, 1, 3, 6, 8, 2, 4, 1, 8, 7, 1, 7, 1, 3, 7, 1, 7, 4, 6, 1, 4, 2, 6, 3, 10, 6, 8, 2, 5, 1, 2, 5, 6, 14, 7, 1, 8, 1, 1, 3, 3, 7, 5, 6, 2, 2, 7, 1, 1, 
9, 6, 1, 4, 2, 5, 5, 3, 1, 3, 5, 5, 3, 4, 2, 2, 5, 2, 2, 3, 1, 1, 3, 3, 4, 3, 8, 1, 5, 4, 2, 7, 2, 5, 2, 2, 5, 1, 1, 5, 2, 3, 5, 1, 5, 2, 4, 9, 4, 4, 4, 4, 2, 4, 7, 2, 2, 6, 1, 1, 2, 4, 3, 1, 3, 4, 4, 3, 4, 1, 3, 5, 1, 3, 6, 5, 1, 5, 6, 9, 3, 8, 7, 1, 4, 9, 4, 7, 7, 1, 3, 2, 5, 4, 3, 6, 4, 2, 2, 3, 3, 2, 2, 6, 1, 3, 4, 5, 6, 7, 6, 6, 3, 5, 2, 4, 7, 1, 1, 5, 1, 2, 2, 5, 1, 4, 1, 2, 5, 1, 4, 2, 4, 3, 13, 4, 4, 5, 7, 2, 3, 1, 3, 9, 2, 3, 10, 4, 2, 3, 6, 2, 2, 7, 1, 1, 8, 1, 1, 6, 1, 1, 3, 6, 3, 8, 3, 4, 3, 3, 7, 6, 4, 3, 1, 5, 5, 3, 6, 3, 11, 6, 7, 1, 4, 2, 2, 4, 2, 6, 2, 4, 5, 2, 6, 2, 4, 2, 2, 7, 1, 1, 5, 1, 1, 8, 3, 2, 9, 3, 4, 2, 3, 3, 1, 4, 5, 7, 1, 5, 4, 6, 3, 5, 8, 2, 3, 3, 1, 8, 1, 5, 3, 6, 1, 3, 3, 2, 1, 4, 9, 4, 1, 3, 4, 4, 7, 2, 1, 4, 1, 7, 1, 7, 3, 5, 2, 3, 5, 1, 3, 7, 3, 3, 6, 3, 4, 1, 4, 2, 2, 5, 1, 1, 6, 1, 1, 3, 4, 7, 2, 1, 2, 7, 4, 7, 4, 4, 1, 4, 2, 8, 5, 1, 6, 1, 5, 3, 5, 2, 4, 5, 5, 3, 5, 5, 1, 2, 5, 2, 7, 4, 2, 3, 4, 1, 3, 3, 4, 6, 4, 7, 2, 3, 3, 7, 1, 2, 5, 4, 4, 5, 1, 5, 1, 4, 5, 5, 4, 1, 2, ...]
# Frequency distribution over token lengths in text1: each sample is a length,
# and its count is the number of tokens having that length.
fdist = FreqDist(map(len, text1))
print(fdist)
<FreqDist with 19 samples and 260819 outcomes>
# Display the distribution itself (the notebook shows its repr, truncated).
fdist
FreqDist({3: 50223, 1: 47933, 4: 42345, 2: 38513, 5: 26597, 6: 17111, 7: 14399, 8: 9966, 9: 6428, 10: 3528, ...})
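# FreqDist({length: count, ...}): each key is a word length and each value is how frequently words of that length occur. For example, the entry for 3 tells how many times 3-character words appeared.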
# Sanity check: len() of a string is its number of characters.
len('length')
6
# With no argument, most_common() lists every (length, count) pair, most frequent first.
fdist.most_common()
[(3, 50223), (1, 47933), (4, 42345), (2, 38513), (5, 26597), (6, 17111), (7, 14399), (8, 9966), (9, 6428), (10, 3528), (11, 1873), (12, 1053), (13, 567), (14, 177), (15, 70), (16, 22), (17, 12), (18, 1), (20, 1)]
fdist.max()
# Q: Is '.' a kind of glue here?
# A: Yes. fdist is a FreqDist object, and the dot accesses one of its methods;
#    .max() and .most_common() are method calls on that object.
#    max() returns the sample with the highest count (here the word length 3).
3
fdist[3]
# Q: Is [] an index?
# A: It is a key lookup, as with a dictionary: fdist[3] returns the count
#    recorded for the sample 3, i.e. how many tokens have length 3.
50223
# Relative frequency of length-3 tokens: their count divided by the total number of outcomes.
fdist.freq(3)
0.19255882431878046
# Display the example sentence sent7 (a list of tokens) used by the filters below.
sent7
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
# Tokens of sent7 shorter than 4 characters, in sentence order (duplicates kept).
[token for token in sent7 if len(token) < 4]
[',', '61', 'old', ',', 'the', 'as', 'a', '29', '.']
# Tokens of sent7 with at most 4 characters, in sentence order (duplicates kept).
[token for token in sent7 if len(token) <= 4]
[',', '61', 'old', ',', 'will', 'join', 'the', 'as', 'a', 'Nov.', '29', '.']
# Distinct tokens of sent7 that begin with 'P', sorted.
sorted({token for token in sent7 if token.startswith('P')})
['Pierre']
# Distinct tokens of sent7 that end with 'a', sorted.
sorted({token for token in sent7 if token.endswith('a')})
['a']
# Distinct tokens of sent7 whose cased characters are all lowercase, sorted.
sorted({token for token in sent7 if token.islower()})
['a', 'as', 'board', 'director', 'join', 'nonexecutive', 'old', 'the', 'will', 'years']
# Distinct tokens of sent7 whose cased characters are all uppercase, sorted.
sorted({token for token in sent7 if token.isupper()})
[]
sorted({token for token in sent7 if token.isalpha()})
# Q: What does this mean?
# A: str.isalpha() is True only when every character of the token is a letter,
#    so punctuation (','), numbers ('61') and 'Nov.' (it contains '.') are dropped.
['Pierre', 'Vinken', 'a', 'as', 'board', 'director', 'join', 'nonexecutive', 'old', 'the', 'will', 'years']
# Distinct tokens of sent7 that contain the substring 'exe', sorted.
sorted({token for token in sent7 if 'exe' in token})
['nonexecutive']
sorted(s for s in set(sent7) if s.isalnum())
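# Q. I don't get this, what does this mean?
# A. s.isalnum() is True only when every character of s is a letter or a digit, so '29' and '61' are kept while ',', '.' and 'Nov.' are filtered out.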
['29', '61', 'Pierre', 'Vinken', 'a', 'as', 'board', 'director', 'join', 'nonexecutive', 'old', 'the', 'will', 'years']
sorted(s for s in set(sent7) if s.isdigit())
['29', '61']
sorted(s for s in set(sent7) if s.istitle())
['Nov.', 'Pierre', 'Vinken']
# Hyphenated tokens of text7 that also contain the substring 'index'.
# Both conditions must be real membership tests: writing the second one as the
# quoted string 'index in w' yields a non-empty string, which is always true,
# and the filter would then return every hyphenated token.
sorted(w for w in set(text7) if '-' in w and 'index' in w)
['Stock-index', 'index-arbitrage', 'index-fund', 'index-options', 'index-related', 'stock-index']
sorted(word for word in set(text3) if word.istitle() and len(word) > 10)
['Abelmizraim', 'Allonbachuth', 'Beerlahairoi', 'Canaanitish', 'Chedorlaomer', 'Girgashites', 'Hazarmaveth', 'Hazezontamar', 'Ishmeelites', 'Jegarsahadutha', 'Jehovahjireh', 'Kirjatharba', 'Melchizedek', 'Mesopotamia', 'Peradventure', 'Philistines', 'Zaphnathpaaneah']
sorted(w2 for w2 in set(sent7) if not w2.islower())
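# Q: Why do ',', '.' and the numbers appear in this result?
# A: w2.islower() is False for strings that contain no cased letters at all, so `not w2.islower()` is True for punctuation and digits as well as for capitalised words.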
[',', '.', '29', '61', 'Nov.', 'Pierre', 'Vinken']
sorted(t for t in set(text2) if 'cie' in t or 'cei' in t)
['ancient', 'ceiling', 'conceit', 'conceited', 'conceive', 'conscience', 'conscientious', 'conscientiously', 'deceitful', 'deceive', 'deceived', 'deceiving', 'deficiencies', 'deficiency', 'deficient', 'delicacies', 'excellencies', 'fancied', 'insufficiency', 'insufficient', 'legacies', 'perceive', 'perceived', 'perceiving', 'prescience', 'prophecies', 'receipt', 'receive', 'received', 'receiving', 'society', 'species', 'sufficient', 'sufficiently', 'undeceive', 'undeceiving']
[len(w) for w in text1]
[1, 4, 4, 2, 6, 8, 4, 1, 9, 1, 1, 8, 2, 1, 4, 11, 5, 2, 1, 7, 6, 1, 3, 4, 5, 2, 10, 2, 4, 1, 5, 1, 4, 1, 3, 5, 1, 1, 3, 3, 3, 1, 2, 3, 4, 7, 3, 3, 8, 3, 8, 1, 4, 1, 5, 12, 1, 9, 11, 4, 3, 3, 3, 5, 2, 3, 3, 5, 7, 2, 3, 5, 1, 2, 5, 2, 4, 3, 3, 8, 1, 2, 7, 6, 8, 3, 2, 3, 9, 1, 1, 5, 3, 4, 2, 4, 2, 6, 6, 1, 3, 2, 5, 4, 2, 4, 4, 1, 5, 1, 4, 2, 2, 2, 6, 2, 3, 6, 7, 3, 1, 7, 9, 1, 3, 6, 1, 1, 5, 6, 5, 6, 3, 13, 2, 3, 4, 1, 3, 7, 4, 5, 2, 3, 4, 2, 2, 8, 1, 5, 1, 3, 2, 1, 3, 3, 1, 4, 1, 4, 6, 2, 5, 4, 9, 2, 7, 1, 3, 2, 3, 1, 5, 2, 6, 2, 7, 2, 2, 7, 1, 1, 10, 1, 5, 1, 3, 2, 2, 4, 11, 4, 3, 3, 1, 3, 3, 1, 6, 1, 1, 1, 1, 1, 4, 1, 3, 1, 2, 4, 1, 2, 6, 2, 2, 10, 1, 1, 10, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 6, 1, 3, 1, 5, 1, 4, 1, 7, 1, 5, 1, 9, 1, 5, 1, 7, 1, 7, 1, 6, 1, 7, 1, 7, 1, 5, 1, 4, 1, 4, 1, 5, 1, 5, 1, 4, 1, 4, 1, 11, 1, 8, 1, 8, 2, 1, 3, 1, 3, 1, 9, 2, 2, 4, 2, 4, 4, 4, 4, 11, 8, 3, 4, 1, 4, 2, 1, 4, 5, 2, 1, 3, 1, 3, 7, 2, 4, 4, 7, 3, 4, 8, 3, 6, 1, 6, 2, 3, 5, 1, 7, 2, 8, 6, 9, 2, 6, 2, 5, 7, 4, 2, 3, 4, 10, 1, 6, 2, 7, 1, 9, 3, 4, 3, 1, 2, 5, 4, 2, 5, 1, 4, 3, 8, 1, 8, 5, 10, 1, 7, 9, 1, 2, 5, 8, 1, 3, 9, 6, 8, 1, 3, 4, 2, 1, 2, 8, 3, 7, 7, 9, 1, 2, 4, 2, 3, 5, 4, 9, 1, 5, 8, 3, 6, 8, 2, 12, 1, 2, 9, 1, 8, 4, 1, 1, 3, 4, 2, 4, 3, 4, 13, 4, 1, 7, 1, 7, 1, 3, 4, 2, 9, 1, 2, 4, 7, 3, 11, 1, 9, 3, 3, 1, 2, 4, 4, 4, 1, 4, 5, 2, 1, 3, 1, 3, 1, 5, 11, 1, 2, 1, 4, 9, 2, 4, 8, 1, 6, 5, 5, 2, 4, 2, 4, 5, 4, 4, 4, 1, 3, 3, 4, 4, 4, 6, 5, 2, 3, 4, 1, 6, 1, 3, 4, 4, 3, 9, 5, 2, 3, 1, 3, 4, 4, 1, 8, 1, 3, 1, 3, 4, 9, 4, 5, 1, 3, 3, 2, 4, 7, 1, 4, 4, 4, 3, 5, 7, 1, 3, 2, 3, 10, 10, 7, 2, 4, 2, 2, 1, 3, 1, 4, 1, 3, 2, 3, 4, 3, 4, 5, 2, 4, 2, 6, 3, 5, 1, 2, 2, 4, 3, 4, 5, 2, 3, 4, 2, 9, 1, 5, 4, 1, 5, 5, 3, 7, 5, 3, 3, 9, 3, 2, 1, 3, 4, 4, 4, 5, 3, 3, 5, 2, 3, 5, 1, 4, 4, 4, 6, 1, 3, 4, 7, 3, 4, 4, 6, 3, 8, 3, 3, 5, 1, 7, 7, 1, 3, 6, 8, 2, 4, 1, 8, 7, 1, 7, 1, 3, 7, 1, 7, 4, 6, 1, 4, 2, 6, 3, 10, 6, 8, 2, 5, 1, 2, 5, 6, 14, 7, 1, 8, 1, 1, 3, 3, 7, 5, 6, 2, 2, 7, 1, 1, 
9, 6, 1, 4, 2, 5, 5, 3, 1, 3, 5, 5, 3, 4, 2, 2, 5, 2, 2, 3, 1, 1, 3, 3, 4, 3, 8, 1, 5, 4, 2, 7, 2, 5, 2, 2, 5, 1, 1, 5, 2, 3, 5, 1, 5, 2, 4, 9, 4, 4, 4, 4, 2, 4, 7, 2, 2, 6, 1, 1, 2, 4, 3, 1, 3, 4, 4, 3, 4, 1, 3, 5, 1, 3, 6, 5, 1, 5, 6, 9, 3, 8, 7, 1, 4, 9, 4, 7, 7, 1, 3, 2, 5, 4, 3, 6, 4, 2, 2, 3, 3, 2, 2, 6, 1, 3, 4, 5, 6, 7, 6, 6, 3, 5, 2, 4, 7, 1, 1, 5, 1, 2, 2, 5, 1, 4, 1, 2, 5, 1, 4, 2, 4, 3, 13, 4, 4, 5, 7, 2, 3, 1, 3, 9, 2, 3, 10, 4, 2, 3, 6, 2, 2, 7, 1, 1, 8, 1, 1, 6, 1, 1, 3, 6, 3, 8, 3, 4, 3, 3, 7, 6, 4, 3, 1, 5, 5, 3, 6, 3, 11, 6, 7, 1, 4, 2, 2, 4, 2, 6, 2, 4, 5, 2, 6, 2, 4, 2, 2, 7, 1, 1, 5, 1, 1, 8, 3, 2, 9, 3, 4, 2, 3, 3, 1, 4, 5, 7, 1, 5, 4, 6, 3, 5, 8, 2, 3, 3, 1, 8, 1, 5, 3, 6, 1, 3, 3, 2, 1, 4, 9, 4, 1, 3, 4, 4, 7, 2, 1, 4, 1, 7, 1, 7, 3, 5, 2, 3, 5, 1, 3, 7, 3, 3, 6, 3, 4, 1, 4, 2, 2, 5, 1, 1, 6, 1, 1, 3, 4, 7, 2, 1, 2, 7, 4, 7, 4, 4, 1, 4, 2, 8, 5, 1, 6, 1, 5, 3, 5, 2, 4, 5, 5, 3, 5, 5, 1, 2, 5, 2, 7, 4, 2, 3, 4, 1, 3, 3, 4, 6, 4, 7, 2, 3, 3, 7, 1, 2, 5, 4, 4, 5, 1, 5, 1, 4, 5, 5, 4, 1, 2, ...]
[w.upper() for w in text1]
['[', 'MOBY', 'DICK', 'BY', 'HERMAN', 'MELVILLE', '1851', ']', 'ETYMOLOGY', '.', '(', 'SUPPLIED', 'BY', 'A', 'LATE', 'CONSUMPTIVE', 'USHER', 'TO', 'A', 'GRAMMAR', 'SCHOOL', ')', 'THE', 'PALE', 'USHER', '--', 'THREADBARE', 'IN', 'COAT', ',', 'HEART', ',', 'BODY', ',', 'AND', 'BRAIN', ';', 'I', 'SEE', 'HIM', 'NOW', '.', 'HE', 'WAS', 'EVER', 'DUSTING', 'HIS', 'OLD', 'LEXICONS', 'AND', 'GRAMMARS', ',', 'WITH', 'A', 'QUEER', 'HANDKERCHIEF', ',', 'MOCKINGLY', 'EMBELLISHED', 'WITH', 'ALL', 'THE', 'GAY', 'FLAGS', 'OF', 'ALL', 'THE', 'KNOWN', 'NATIONS', 'OF', 'THE', 'WORLD', '.', 'HE', 'LOVED', 'TO', 'DUST', 'HIS', 'OLD', 'GRAMMARS', ';', 'IT', 'SOMEHOW', 'MILDLY', 'REMINDED', 'HIM', 'OF', 'HIS', 'MORTALITY', '.', '"', 'WHILE', 'YOU', 'TAKE', 'IN', 'HAND', 'TO', 'SCHOOL', 'OTHERS', ',', 'AND', 'TO', 'TEACH', 'THEM', 'BY', 'WHAT', 'NAME', 'A', 'WHALE', '-', 'FISH', 'IS', 'TO', 'BE', 'CALLED', 'IN', 'OUR', 'TONGUE', 'LEAVING', 'OUT', ',', 'THROUGH', 'IGNORANCE', ',', 'THE', 'LETTER', 'H', ',', 'WHICH', 'ALMOST', 'ALONE', 'MAKETH', 'THE', 'SIGNIFICATION', 'OF', 'THE', 'WORD', ',', 'YOU', 'DELIVER', 'THAT', 'WHICH', 'IS', 'NOT', 'TRUE', '."', '--', 'HACKLUYT', '"', 'WHALE', '.', '...', 'SW', '.', 'AND', 'DAN', '.', 'HVAL', '.', 'THIS', 'ANIMAL', 'IS', 'NAMED', 'FROM', 'ROUNDNESS', 'OR', 'ROLLING', ';', 'FOR', 'IN', 'DAN', '.', 'HVALT', 'IS', 'ARCHED', 'OR', 'VAULTED', '."', '--', 'WEBSTER', "'", 'S', 'DICTIONARY', '"', 'WHALE', '.', '...', 'IT', 'IS', 'MORE', 'IMMEDIATELY', 'FROM', 'THE', 'DUT', '.', 'AND', 'GER', '.', 'WALLEN', ';', 'A', '.', 'S', '.', 'WALW', '-', 'IAN', ',', 'TO', 'ROLL', ',', 'TO', 'WALLOW', '."', '--', 'RICHARDSON', "'", 'S', 'DICTIONARY', 'KETOS', ',', 'GREEK', '.', 'CETUS', ',', 'LATIN', '.', 'WHOEL', ',', 'ANGLO', '-', 'SAXON', '.', 'HVALT', ',', 'DANISH', '.', 'WAL', ',', 'DUTCH', '.', 'HWAL', ',', 'SWEDISH', '.', 'WHALE', ',', 'ICELANDIC', '.', 'WHALE', ',', 'ENGLISH', '.', 'BALEINE', ',', 'FRENCH', '.', 'BALLENA', ',', 'SPANISH', '.', 'PEKEE', '-', 
'NUEE', '-', 'NUEE', ',', 'FEGEE', '.', 'PEKEE', '-', 'NUEE', '-', 'NUEE', ',', 'ERROMANGOAN', '.', 'EXTRACTS', '(', 'SUPPLIED', 'BY', 'A', 'SUB', '-', 'SUB', '-', 'LIBRARIAN', ').', 'IT', 'WILL', 'BE', 'SEEN', 'THAT', 'THIS', 'MERE', 'PAINSTAKING', 'BURROWER', 'AND', 'GRUB', '-', 'WORM', 'OF', 'A', 'POOR', 'DEVIL', 'OF', 'A', 'SUB', '-', 'SUB', 'APPEARS', 'TO', 'HAVE', 'GONE', 'THROUGH', 'THE', 'LONG', 'VATICANS', 'AND', 'STREET', '-', 'STALLS', 'OF', 'THE', 'EARTH', ',', 'PICKING', 'UP', 'WHATEVER', 'RANDOM', 'ALLUSIONS', 'TO', 'WHALES', 'HE', 'COULD', 'ANYWAYS', 'FIND', 'IN', 'ANY', 'BOOK', 'WHATSOEVER', ',', 'SACRED', 'OR', 'PROFANE', '.', 'THEREFORE', 'YOU', 'MUST', 'NOT', ',', 'IN', 'EVERY', 'CASE', 'AT', 'LEAST', ',', 'TAKE', 'THE', 'HIGGLEDY', '-', 'PIGGLEDY', 'WHALE', 'STATEMENTS', ',', 'HOWEVER', 'AUTHENTIC', ',', 'IN', 'THESE', 'EXTRACTS', ',', 'FOR', 'VERITABLE', 'GOSPEL', 'CETOLOGY', '.', 'FAR', 'FROM', 'IT', '.', 'AS', 'TOUCHING', 'THE', 'ANCIENT', 'AUTHORS', 'GENERALLY', ',', 'AS', 'WELL', 'AS', 'THE', 'POETS', 'HERE', 'APPEARING', ',', 'THESE', 'EXTRACTS', 'ARE', 'SOLELY', 'VALUABLE', 'OR', 'ENTERTAINING', ',', 'AS', 'AFFORDING', 'A', 'GLANCING', 'BIRD', "'", 'S', 'EYE', 'VIEW', 'OF', 'WHAT', 'HAS', 'BEEN', 'PROMISCUOUSLY', 'SAID', ',', 'THOUGHT', ',', 'FANCIED', ',', 'AND', 'SUNG', 'OF', 'LEVIATHAN', ',', 'BY', 'MANY', 'NATIONS', 'AND', 'GENERATIONS', ',', 'INCLUDING', 'OUR', 'OWN', '.', 'SO', 'FARE', 'THEE', 'WELL', ',', 'POOR', 'DEVIL', 'OF', 'A', 'SUB', '-', 'SUB', ',', 'WHOSE', 'COMMENTATOR', 'I', 'AM', '.', 'THOU', 'BELONGEST', 'TO', 'THAT', 'HOPELESS', ',', 'SALLOW', 'TRIBE', 'WHICH', 'NO', 'WINE', 'OF', 'THIS', 'WORLD', 'WILL', 'EVER', 'WARM', ';', 'AND', 'FOR', 'WHOM', 'EVEN', 'PALE', 'SHERRY', 'WOULD', 'BE', 'TOO', 'ROSY', '-', 'STRONG', ';', 'BUT', 'WITH', 'WHOM', 'ONE', 'SOMETIMES', 'LOVES', 'TO', 'SIT', ',', 'AND', 'FEEL', 'POOR', '-', 'DEVILISH', ',', 'TOO', ';', 'AND', 'GROW', 'CONVIVIAL', 'UPON', 'TEARS', ';', 'AND', 'SAY', 'TO', 
'THEM', 'BLUNTLY', ',', 'WITH', 'FULL', 'EYES', 'AND', 'EMPTY', 'GLASSES', ',', 'AND', 'IN', 'NOT', 'ALTOGETHER', 'UNPLEASANT', 'SADNESS', '--', 'GIVE', 'IT', 'UP', ',', 'SUB', '-', 'SUBS', '!', 'FOR', 'BY', 'HOW', 'MUCH', 'THE', 'MORE', 'PAINS', 'YE', 'TAKE', 'TO', 'PLEASE', 'THE', 'WORLD', ',', 'BY', 'SO', 'MUCH', 'THE', 'MORE', 'SHALL', 'YE', 'FOR', 'EVER', 'GO', 'THANKLESS', '!', 'WOULD', 'THAT', 'I', 'COULD', 'CLEAR', 'OUT', 'HAMPTON', 'COURT', 'AND', 'THE', 'TUILERIES', 'FOR', 'YE', '!', 'BUT', 'GULP', 'DOWN', 'YOUR', 'TEARS', 'AND', 'HIE', 'ALOFT', 'TO', 'THE', 'ROYAL', '-', 'MAST', 'WITH', 'YOUR', 'HEARTS', ';', 'FOR', 'YOUR', 'FRIENDS', 'WHO', 'HAVE', 'GONE', 'BEFORE', 'ARE', 'CLEARING', 'OUT', 'THE', 'SEVEN', '-', 'STORIED', 'HEAVENS', ',', 'AND', 'MAKING', 'REFUGEES', 'OF', 'LONG', '-', 'PAMPERED', 'GABRIEL', ',', 'MICHAEL', ',', 'AND', 'RAPHAEL', ',', 'AGAINST', 'YOUR', 'COMING', '.', 'HERE', 'YE', 'STRIKE', 'BUT', 'SPLINTERED', 'HEARTS', 'TOGETHER', '--', 'THERE', ',', 'YE', 'SHALL', 'STRIKE', 'UNSPLINTERABLE', 'GLASSES', '!', 'EXTRACTS', '.', '"', 'AND', 'GOD', 'CREATED', 'GREAT', 'WHALES', '."', '--', 'GENESIS', '.', '"', 'LEVIATHAN', 'MAKETH', 'A', 'PATH', 'TO', 'SHINE', 'AFTER', 'HIM', ';', 'ONE', 'WOULD', 'THINK', 'THE', 'DEEP', 'TO', 'BE', 'HOARY', '."', '--', 'JOB', '.', '"', 'NOW', 'THE', 'LORD', 'HAD', 'PREPARED', 'A', 'GREAT', 'FISH', 'TO', 'SWALLOW', 'UP', 'JONAH', '."', '--', 'JONAH', '.', '"', 'THERE', 'GO', 'THE', 'SHIPS', ';', 'THERE', 'IS', 'THAT', 'LEVIATHAN', 'WHOM', 'THOU', 'HAST', 'MADE', 'TO', 'PLAY', 'THEREIN', '."', '--', 'PSALMS', '.', '"', 'IN', 'THAT', 'DAY', ',', 'THE', 'LORD', 'WITH', 'HIS', 'SORE', ',', 'AND', 'GREAT', ',', 'AND', 'STRONG', 'SWORD', ',', 'SHALL', 'PUNISH', 'LEVIATHAN', 'THE', 'PIERCING', 'SERPENT', ',', 'EVEN', 'LEVIATHAN', 'THAT', 'CROOKED', 'SERPENT', ';', 'AND', 'HE', 'SHALL', 'SLAY', 'THE', 'DRAGON', 'THAT', 'IS', 'IN', 'THE', 'SEA', '."', '--', 'ISAIAH', '"', 'AND', 'WHAT', 'THING', 'SOEVER', 
'BESIDES', 'COMETH', 'WITHIN', 'THE', 'CHAOS', 'OF', 'THIS', 'MONSTER', "'", 'S', 'MOUTH', ',', 'BE', 'IT', 'BEAST', ',', 'BOAT', ',', 'OR', 'STONE', ',', 'DOWN', 'IT', 'GOES', 'ALL', 'INCONTINENTLY', 'THAT', 'FOUL', 'GREAT', 'SWALLOW', 'OF', 'HIS', ',', 'AND', 'PERISHETH', 'IN', 'THE', 'BOTTOMLESS', 'GULF', 'OF', 'HIS', 'PAUNCH', '."', '--', 'HOLLAND', "'", 'S', 'PLUTARCH', "'", 'S', 'MORALS', '.', '"', 'THE', 'INDIAN', 'SEA', 'BREEDETH', 'THE', 'MOST', 'AND', 'THE', 'BIGGEST', 'FISHES', 'THAT', 'ARE', ':', 'AMONG', 'WHICH', 'THE', 'WHALES', 'AND', 'WHIRLPOOLES', 'CALLED', 'BALAENE', ',', 'TAKE', 'UP', 'AS', 'MUCH', 'IN', 'LENGTH', 'AS', 'FOUR', 'ACRES', 'OR', 'ARPENS', 'OF', 'LAND', '."', '--', 'HOLLAND', "'", 'S', 'PLINY', '.', '"', 'SCARCELY', 'HAD', 'WE', 'PROCEEDED', 'TWO', 'DAYS', 'ON', 'THE', 'SEA', ',', 'WHEN', 'ABOUT', 'SUNRISE', 'A', 'GREAT', 'MANY', 'WHALES', 'AND', 'OTHER', 'MONSTERS', 'OF', 'THE', 'SEA', ',', 'APPEARED', '.', 'AMONG', 'THE', 'FORMER', ',', 'ONE', 'WAS', 'OF', 'A', 'MOST', 'MONSTROUS', 'SIZE', '.', '...', 'THIS', 'CAME', 'TOWARDS', 'US', ',', 'OPEN', '-', 'MOUTHED', ',', 'RAISING', 'THE', 'WAVES', 'ON', 'ALL', 'SIDES', ',', 'AND', 'BEATING', 'THE', 'SEA', 'BEFORE', 'HIM', 'INTO', 'A', 'FOAM', '."', '--', 'TOOKE', "'", 'S', 'LUCIAN', '.', '"', 'THE', 'TRUE', 'HISTORY', '."', '"', 'HE', 'VISITED', 'THIS', 'COUNTRY', 'ALSO', 'WITH', 'A', 'VIEW', 'OF', 'CATCHING', 'HORSE', '-', 'WHALES', ',', 'WHICH', 'HAD', 'BONES', 'OF', 'VERY', 'GREAT', 'VALUE', 'FOR', 'THEIR', 'TEETH', ',', 'OF', 'WHICH', 'HE', 'BROUGHT', 'SOME', 'TO', 'THE', 'KING', '.', '...', 'THE', 'BEST', 'WHALES', 'WERE', 'CATCHED', 'IN', 'HIS', 'OWN', 'COUNTRY', ',', 'OF', 'WHICH', 'SOME', 'WERE', 'FORTY', '-', 'EIGHT', ',', 'SOME', 'FIFTY', 'YARDS', 'LONG', '.', 'HE', ...]
# [f(w) for ...] or [w.f() for ...]: there is no need to understand the difference between these two forms for now.
len(text1)
260819
text1
<Text: Moby Dick by Herman Melville 1851>
set(text1)
{'bottomless', 'incoherences', 'Epilogue', 'Dismal', 'hypos', 'magnificent', 'Thus', 'insulated', 'Egyptian', 'matter', 'rules', 'idolator', 'romantic', 'sinker', 'gloss', 'volley', 'Burkes', 'tested', 'produces', 'talkest', 'goggling', 'cask', 'conceives', 'Burst', 'shafts', 'Sands', 'Mecca', 'noises', 'accept', 'reluctantly', 'Susan', 'veracity', 'joyfully', 'nodded', 'tint', 'loan', 'corrupt', 'through', 'bulge', 'consequent', 'remonstrating', 'respects', 'Corlaer', 'COOK', 'mannerly', 'parchingly', 'scolloped', 'confronted', 'OLD', 'modelled', 'hugeness', 'lords', 'Jollily', 'triumphal', 'aspect', 'hell', 'Pillar', 'intimate', 'thrice', 'chowders', 'guard', 'fondly', 'hopeful', 'net', 'nurseries', 'inquiring', 'bantering', 'Erroneous', 'tablecloth', 'hatched', 'whist', 'twos', 'beds', 'impeach', 'FISH', 'numbered', 'few', 'cripple', 'spreading', 'lapse', 'blooded', 'Spermacetti', 'credulous', 'javelin', 'Angels', 'swimming', 'discoloured', 'stains', 'mizen', 'contrary', 'inventor', 'gardenny', 'flatter', 'gainsaid', 'abasement', 'kindred', 'dimensions', 'protesting', 'overpowered', 'reduced', 'fiercer', 'squared', 'close', 'MEETING', 'uninterrupted', 'drills', 'prostrate', 'Taking', 'description', 'firmaments', 'nonplussed', 'haze', 'catched', 'soggy', 'travelling', 'ober', 'stands', 'thereby', 'considerable', 'barbaric', 'signing', 'enclosed', 'clerical', 'grunt', 'clothes', 'tugged', 'keys', 'pantaloons', ',', 'auxiliary', 'panoramas', 'against', 'afore', 'unquestionable', 'foreground', 'patchwork', 'dedication', 'straggling', 'comprehending', 'Platonist', 'maledictions', 'serpentines', 'throttle', 'winter', 'empires', 'principles', 'vision', 'believer', 'Cooke', 'Latter', 'along', 'feasts', 'resume', 'Strictly', 'bounteous', 'SPOUTINGS', 'drowned', 'valuable', 'unobtrusive', 'feathers', 'liquor', 'Nature', 'braided', 'relatively', 'maliciously', 'colour', 'unsplinterable', 'boasting', 'QUITO', 'True', 'streamed', 'predestinating', 'reasonableness', 
'Midnight', 'welding', 'Musket', 'pockets', 'arriving', 'liturgies', 'universal', 'yards', 'betokening', 'knitted', 'AFFGHANISTAN', 'thawed', 'swamped', 'recumbent', 'specialities', 'harmed', 'disconnected', 'dun', 'poniard', 'linked', 'leaded', 'Straits', 'congeal', 'daft', 'took', 'unstaggering', 'festivities', 'Gospel', 'scratch', 'Elijah', 'rearward', 'seemingly', 'hatchway', 'White', 'outriders', 'unconsciousness', 'pace', 'Applied', 'wheresoe', 'queen', 'hollow', 'clothing', 'confounded', 'objection', 'painfully', 'flock', 'besooted', 'painfulness', 'nightmare', 'h', 'faintly', 'joyous', 'prints', 'Bremen', 'agonies', 'elongated', 'foreboding', 'intervened', 'Erie', 'Arkansas', 'College', 'bass', 'compile', 'steel', 'dimensioned', 'subservient', 'savagery', 'letting', 'planks', 'lathering', 'heterogeneously', 'hobbling', 'cucumbers', 'popular', 'barbs', '---"', 'scene', 'person', 'defection', 'Enough', 'WALTER', 'shroud', 'dull', 'Shirt', 'distinctive', 'colony', 'yawned', 'caught', 'migrations', 'gudgeons', 'Owen', 'Reference', 'Physiognomist', 'clefts', 'Socratic', 'woodcock', 'wonders', 'interpreting', 'wolfish', 'shouted', 'gestation', 'Bouton', 'gazed', 'strata', 'coloured', 'cleets', 'Customs', 'unspeakable', 'angles', 'flutterings', 'cognisable', '13', 'repelling', 'rafted', 'quicken', 'idol', 'summer', 'free', 'froze', 'Mab', 'citations', 'capital', 'smelt', 'Whew', '18', 'nursing', 'couldn', 'worryings', 'widow', 'de', 'thanked', 'liv', 'obedient', 'cutlery', 'solved', 'preliminary', 'merchants', 'past', 'mended', 'hummed', 'tendency', 'natural', 'intact', 'insensibly', 'Leviathanism', 'agrarian', 'Common', 'crystals', 'quietest', 'sunk', 'rib', 'skip', 'humiliation', 'glarings', 'kneel', 'exhaustive', 'madest', 'mistrust', 'humble', 'laborers', 'abated', 'Jesus', 'carpenters', 'Lamatins', 'working', 'inhaled', 'furnace', 'diametrically', 'assert', 'layn', 'raked', 'contemplations', 'bulky', 'shuffle', 'slanting', 'trimming', 'dozen', 'thirteen', 
'boldly', 'wager', 'powdered', 'loudly', 'privation', 'rascals', 'clasp', 'economically', 'refuse', 'excludes', 'cents', 'drought', 'Cooks', 'collection', 'Red', 'Canallers', 'pricking', 'endures', 'Mapple', 'traditional', 'walrus', 'cried', 'deprecatory', 'Gibraltar', 'ubiquity', 'chowder', 'EYES', 'IAN', 'expansion', 'foe', 'Mississippi', 'overlook', 'flanking', 'evaporate', 'pulse', 'discovery', 'unforseen', 'overmanned', 'flooded', 'medal', 'vortex', 'chrysalis', 'everyway', 'Amsterdam', 'capping', 'perchance', 'laughter', 'predicted', 'far', 'arboring', 'puzzling', 'average', 'slapping', 'beats', 'disembowelments', 'cannibalistically', 'hooroosh', 'admonish', 'Snatch', 'Led', 'ability', "!'", 'grin', 'misgiving', 'tanned', 'wet', 'Junk', 'spouts', 'why', 'suppress', 'fancying', 'sung', 'glass', 'recover', 'pursue', 'firewood', 'Eastern', 'quiescence', 'congregation', '.*--', 'tongue', 'thriving', 'disputes', 'tiny', 'Frighted', 'junior', 'wrapt', 'unneeded', 'Loaded', 'job', 'hear', 'gamesomeness', 'LIGHTNING', 'unmanufactured', 'roast', 'Lionel', 'tracery', 'backwoods', 'hues', 'funereal', 'furrowed', 'Fields', 'rounder', 'interruption', 'youngish', 'piece', 'relieve', 'endlessly', 'reflection', 'unceasing', 'prize', 'belled', 'flag', 'resident', 'defyingly', 'Among', 'patronising', 'bosom', 'flowery', 'waistband', 'stuff', 'marble', 'oust', 'Mysticetus', 'tandem', 'brigs', 'haired', 'sprout', 'trace', 'Spread', 'speedy', 'buck', 'tufted', 'evidence', 'Sam', 'lightly', 'HUZZA', 'Seat', 'agonized', 'pens', ':--"', 'Works', 'GONDIBERT', 'dividends', 'abundance', 'furniture', 'tabernacles', 'stuffed', 'faced', 'natures', 'market', 'volunteered', 'Kremlin', 'persuasion', 'whence', '105', 'whom', 'adoration', 'pacing', 'stories', 'excellent', 'blessing', 'bloom', 'central', 'Vermonters', 'breed', 'ask', 'sweat', 'repartees', 'luffs', 'berg', 'harbourless', 'approve', 'Paean', 'carried', 'vane', 'present', 'RAZOR', 'angular', 'discoveries', 'verse', 'owning', 
'surveying', 'subsiding', 'infallibly', 'sanity', 'observable', 'exhaled', 'slumbers', 'cudgelling', 'hacked', 'resembled', 'endurance', 'inhabitiveness', 'roomy', 'charging', 'honey', 'seasoning', 'days', 'tortoise', 'newly', 'murmured', 'tear', 'Inserting', 'yell', 'transports', 'misgrown', 'TWISTED', 'visitants', 'thwart', 'between', 'nondescripts', 'eventually', 'assuming', 'peopling', 'jaw', 'theatre', 'perverse', 'Aware', 'examining', 'quadrant', 'comprehensible', 'technically', 'coffins', 'eternal', 'decoction', 'sways', 'recondite', 'immediate', 'lancet', 'yokes', 'submerged', 'craters', 'dividing', 'Round', 'toothache', 'jellied', 'philosophers', 'perennial', 'passant', 'grocers', 'happy', 'tribulations', 'schoolmaster', 'tongues', 'attested', 'upheaved', 'unreliable', 'controversies', 'different', 'truck', 'Dantean', 'Heave', 'palms', 'extracting', 'Spermaceti', 'Sabbath', 'simultaneous', 'recorded', 'slay', 'Back', 'bond', 'molasses', 'startling', 'notified', 'digestive', 'undetected', 'instant', 'rustiness', 'lamb', 'convicts', 'Swedes', 'slumber', "?--'", 'impurities', 'vale', 'flaky', 'Dominic', 'humps', 'unfortunate', 'bumpkins', 'marketless', 'strings', 'plain', 'embark', 'shoots', 'Comparing', 'RABELAIS', 'used', 'blast', 'Disdain', 'accompanies', 'crotchets', 'ESCAPED', 'drawled', 'pouch', 'confine', 'tack', 'well', 'pennant', 'mannikin', 'census', 'originated', 'sanctuary', 'check', 'legislators', 'presence', 'majestically', 'ESSEX', 'music', 'among', 'tackles', 'large', 'posterity', 'stripped', 'Railroads', 'turns', 'soliloquized', 'shoved', 'Drink', 'legendary', 'concerns', 'protect', 'excitement', 'Crish', 'mustn', 'quaff', 'prime', 'alive', 'besieged', 'surly', 'Borean', 'Animated', 'tiller', 'engrossed', 'sleepless', 'Depend', 'Touching', 'beauty', 'tale', 'Summer', 'hawk', 'tranquillity', 'ornament', 'tragic', 'anvil', 'Tomahawk', 'detailed', 'screen', 'unsuspected', 'roods', 'Do', 'aforesaid', 'Lifted', 'Square', 'unvarying', 'Easy', 
'cannikin', 'honours', 'slowly', 'Anatomist', 'Book', 'seaman', 'chill', 'help', 'roll', 'indicated', 'calves', 'quickening', 'prove', 'acquaintances', 'helmsman', 'almighty', 'engendering', 'wards', 'rugged', 'washing', 'sinister', 'ranging', 'nowise', 'treadle', 'vats', 'Ere', 'fry', 'exploit', 'reflections', 'allurements', 'obscurity', 'steer', 'Wrapped', 'devoutly', 'imagine', 'prudently', '73', 'horrors', 'protracted', 'interest', 'concern', 'muskets', 'BOOKS', 'States', 'rascally', 'geometry', 'chimney', 'thousandth', 'bier', 'cronies', 'extract', 'spermaceti', 'usages', 'Carpet', 'darkness', 'convictions', 'foregone', 'Subs', 'Baliene', 'ache', 'SIXTH', 'perry', 'Berlin', 'chase', 'remembered', 'colder', 'inhabitants', 'spilled', 'dents', 'Superior', 'Italy', 'severed', 'brunt', 'commissioned', 'cowhide', 'populous', 'twigging', 'dishonour', 'tray', 'defunct', 'dissolutions', 'multitudinously', 'ISOLATO', 'retires', 'touches', 'vibrated', 'unsourced', 'Bad', 'single', 'MEMORY', 'fa', 'flints', 'cataract', 'TOOKE', 'crouching', 'despite', 'Hydrus', 'treatise', 'mong', 'Kraken', 'outs', 'virtuous', 'obey', 'boots', 'urbane', 'string', 'thirteenth', 'millions', 'complicated', 'dreaminess', 'distances', 'covers', 'MOVES', 'blacks', 'gained', 'horrid', 'Me', 'Was', 'fearless', 'fares', 'creamy', 'mobbed', 'jumping', 'enthrone', 'indebted', 'Avast', 'Venice', 'lurking', 'price', 'sum', 'couple', 'Tail', 'investigations', 'mightest', 'skeletons', 'obedience', 'feverishly', 'ornamented', 'Gurry', 'unprovided', 'humoredly', 'scheming', 'Delight', 'ocean', 'Tom', 'stepmother', 'besmoked', 'antelope', 'promptitude', 'oceans', 'coined', 'missionaries', 'fixture', 'confess', 'Stand', 'fancies', 'separated', 'charger', 'WAVING', 'emblem', 'pecks', 'traditions', 'GREEK', 'rude', 'faithfully', 'chronically', 'militia', 'imminent', 'happened', 'Captains', 'pomp', 'rollings', 'fluttering', 'slaughtered', "--'", 'risked', 'fought', 'jeering', 'argosy', 'preacher', 'composing', 
'uncomfortableness', 'maidenly', 'snort', 'asses', 'infancy', 'undone', 'wound', 'conservatories', 'Twas', 'piratical', 'effulgences', 'unbutton', 'hereditarily', 'impelled', 'final', 'azure', 'honourableness', 'coax', 'wrinkling', 'unchristian', 'Haarlem', 'cascading', 'verbalists', 'heavenly', 'frugal', 'witnesses', 'Rensselaers', 'clove', 'REPEATED', 'splice', 'liest', 'lightest', 'surcoat', 'chatting', 'prosecuted', 'consisted', 'TIMES', 'magnitude', 'Met', 'Names', 'Pudding', 'scattered', 'comprehensive', 'greenish', 'unwaning', 'hiss', 'maggots', 'deliver', 'flexion', 'thence', 'undressing', 'strenuous', 'receptacle', 'vacancy', 'attacks', 'cuffs', 'Gently', 'squid', 'separating', 'Already', 'bisons', 'rabble', 'Joppa', 'gentlemanlike', 'spiracles', 'Butchers', 'GREENLAND', 'search', 'other', 'task', 'starve', 'blindest', 'intensifying', 'supposed', 'Dome', 'embalming', 'SLOWLY', 'ease', 'Pontoppodan', 'Thirty', 'brow', 'Iceland', 'singing', 'clever', 'symmetrically', 'smallness', 'coronation', 'Iceberg', 'le', 'bomb', 'political', 'yield', 'superficially', 'maiming', 'fishermen', 'crucified', 'echoes', 'haven', 'Saul', 'invite', 'died', 'crouch', 'Judges', '16', 'bolted', 'wouldn', 'unbidden', 'headmost', 'Spurzheim', 'humming', 'longest', ...}
# Case-folded vocabulary of text1: every distinct token after lowercasing.
{token.lower() for token in text1}
{'bottomless', 'incoherences', 'sphinx', 'saxon', 'hypos', 'magnificent', 'insulated', 'sinker', 'matter', 'rules', 'idolator', 'romantic', 'invent', 'gloss', 'volley', 'tested', 'produces', 'talkest', 'goggling', 'cask', 'conceives', 'shafts', 'noises', 'accept', 'reluctantly', 'veracity', 'joyfully', 'nodded', 'tint', 'loan', 'corrupt', 'through', 'bulge', 'seeva', 'consequent', 'savesoul', 'respects', 'remonstrating', 'mannerly', 'circumambulate', 'parchingly', 'scolloped', 'confronted', 'modelled', 'hugeness', 'lords', 'triumphal', 'aspect', 'hell', 'arter', 'intimate', 'thrice', 'chowders', 'guard', 'fondly', 'hopeful', 'net', 'nurseries', 'inquiring', 'bantering', 'tablecloth', 'hatched', 'whist', 'despatch', 'twos', 'beds', 'impeach', 'numbered', 'few', 'cripple', 'spreading', 'primer', 'blooded', 'lapse', 'credulous', 'javelin', 'horned', 'swimming', 'discoloured', 'stains', 'mizen', 'contrary', 'inventor', 'gardenny', 'flatter', 'gainsaid', 'abasement', 'kindred', 'dimensions', 'protesting', 'overpowered', 'reduced', 'fiercer', 'squared', 'close', 'uninterrupted', 'drills', 'prostrate', 'description', 'firmaments', 'nonplussed', 'haze', 'catched', 'soggy', 'travelling', 'ober', 'stands', 'thereby', 'considerable', 'barbaric', 'signing', 'enclosed', 'clerical', 'grunt', 'clothes', 'tugged', 'keys', 'pantaloons', ',', 'auxiliary', 'panoramas', 'against', 'afore', 'unquestionable', 'foreground', 'patchwork', 'dedication', 'straggling', 'comprehending', 'serpentines', 'maledictions', 'swiss', 'throttle', 'winter', 'empires', 'principles', 'vision', 'believer', 'along', 'feasts', 'resume', 'bounteous', 'judith', 'drowned', 'valuable', 'unobtrusive', 'feathers', 'liquor', 'canaris', 'braided', 'relatively', 'maliciously', 'colour', 'himmalehs', 'unsplinterable', 'boasting', 'streamed', 'predestinating', 'reasonableness', 'pockets', 'arriving', 'mungo', 'universal', 'liturgies', 'yards', 'betokening', 'knitted', 'thawed', 'swamped', 'decapitation', 'recumbent', 
'specialities', 'harmed', 'disconnected', 'dun', 'poniard', 'linked', 'leaded', 'congeal', 'daft', 'took', 'unstaggering', 'festivities', 'eliza', 'boone', 'scratch', 'rearward', 'seemingly', 'hatchway', 'outriders', 'unconsciousness', 'pace', 'wheresoe', 'queen', 'hollow', 'clothing', 'confounded', 'objection', 'bendigoes', 'painfully', 'flock', 'besooted', 'painfulness', 'nightmare', 'h', 'faintly', 'joyous', 'prints', 'agonies', 'elongated', 'foreboding', 'intervened', 'christians', 'bass', 'compile', 'steel', 'dimensioned', 'marmora', 'savagery', 'letting', 'subservient', 'planks', 'lathering', 'heterogeneously', 'hobbling', 'cucumbers', 'zoroaster', 'popular', 'barbs', '---"', 'scene', 'person', 'defection', 'shroud', 'dull', 'distinctive', 'yawned', 'colony', 'gudgeons', 'caught', 'migrations', 'clefts', 'woodcock', 'wonders', 'interpreting', 'wolfish', 'shouted', 'gestation', 'gazed', 'strata', 'coloured', 'cleets', 'angles', 'unspeakable', 'flutterings', 'cognisable', '13', 'quakeress', 'greece', 'repelling', 'rafted', 'quicken', 'idol', 'summer', 'free', 'froze', 'hampshire', 'citations', 'capital', 'rinaldini', 'smelt', '18', 'nursing', 'couldn', 'worryings', 'widow', 'de', 'thanked', 'liv', 'obedient', 'cutlery', 'solved', 'preliminary', 'tarquin', 'merchants', 'past', 'mended', 'hummed', 'tendency', 'natural', 'intact', 'insensibly', 'japan', 'agrarian', 'crystals', 'quietest', 'sunk', 'rib', 'skip', 'humiliation', 'grisly', 'glarings', 'kneel', 'exhaustive', 'madest', 'humble', 'mistrust', 'laborers', 'abated', 'carpenters', 'inhaled', 'working', 'furnace', 'diametrically', 'acushnet', 'assert', 'layn', 'raked', 'contemplations', 'bulky', 'shuffle', 'slanting', 'trimming', 'dozen', 'thirteen', 'boldly', 'wager', 'hadst', 'loudly', 'powdered', 'rascals', 'privation', 'clasp', 'economically', 'refuse', 'excludes', 'cents', 'drought', 'collection', 'pricking', 'endures', 'traditional', 'ubiquity', 'walrus', 'cried', 'deprecatory', 'haggardly', 'chowder', 
'expansion', 'foe', 'evaporate', 'overlook', 'flanking', 'pulse', 'flooded', 'discovery', 'unforseen', 'overmanned', 'vortex', 'medal', 'chrysalis', 'everyway', 'capping', 'laughter', 'perchance', 'predicted', 'far', 'arboring', 'puzzling', 'average', 'slapping', 'beats', 'disembowelments', 'cannibalistically', 'hooroosh', 'admonish', 'ability', "!'", 'grin', 'misgiving', 'tanned', 'wet', 'spouts', 'why', 'lapland', 'suppress', 'fancying', 'sung', 'davy', 'glass', 'recover', 'pursue', 'firewood', 'disputes', 'quiescence', 'congregation', '.*--', 'tongue', 'ombay', 'thriving', 'tiny', 'junior', 'wrapt', 'unneeded', 'job', 'hear', 'multum', 'gamesomeness', 'ka', 'unmanufactured', 'roast', 'tracery', 'backwoods', 'hues', 'funereal', 'furrowed', 'rounder', 'interruption', 'youngish', 'piece', 'relieve', 'endlessly', 'reflection', 'unceasing', 'prize', 'belled', 'flag', 'resident', 'scripture', 'defyingly', 'patronising', 'bosom', 'flowery', 'waistband', 'stuff', 'marble', 'oust', 'tandem', 'brigs', 'haired', 'sprout', 'trace', 'speedy', 'buck', 'goa', 'tufted', 'evidence', 'lightly', 'agonized', 'pens', ':--"', 'dividends', 'abundance', 'furniture', 'tabernacles', 'stuffed', 'faced', 'market', 'volunteered', 'persuasion', 'whence', 'garnery', '105', 'whom', 'adoration', 'pacing', 'stories', 'excellent', 'blessing', 'barbary', 'bloom', 'central', 'breed', 'sweat', 'ask', 'repartees', 'approve', 'luffs', 'berg', 'harbourless', 'carried', 'singed', 'vane', 'present', 'congregational', 'angular', 'discoveries', 'verse', 'owning', 'surveying', 'subsiding', 'infallibly', 'sanity', 'observable', 'exhaled', 'slumbers', 'cudgelling', 'hacked', 'resembled', 'endurance', 'inhabitiveness', 'roomy', 'charging', 'honey', 'bennett', 'days', 'seasoning', 'tortoise', 'vide', 'newly', 'murmured', 'ireland', 'tear', 'yell', 'transports', 'misgrown', 'visitants', 'thwart', 'between', 'nondescripts', 'eventually', 'assuming', 'peopling', 'jaw', 'theatre', 'perverse', 'examining', 
'quadrant', 'comprehensible', 'coffins', 'technically', 'eternal', 'decoction', 'anyway', 'sways', 'sag', 'immediate', 'recondite', 'lancet', 'yokes', 'submerged', 'craters', 'dividing', 'toothache', 'jellied', 'philosophers', 'perennial', 'passant', 'grocers', 'happy', 'tribulations', 'schoolmaster', 'tongues', 'attested', 'perseus', 'andrew', 'unreliable', 'controversies', 'different', 'truck', 'upheaved', 'palms', 'extracting', 'simultaneous', 'recorded', 'slay', 'bond', 'molasses', 'startling', 'notified', 'digestive', 'undetected', 'instant', 'belfast', 'lamb', 'convicts', 'rustiness', 'slumber', "?--'", 'impurities', 'vale', 'flaky', 'humps', 'unfortunate', 'bumpkins', 'marketless', 'strings', 'plain', 'embark', 'hindoos', 'shoots', 'used', 'blast', 'accompanies', 'crotchets', 'drawled', 'pouch', 'confine', 'tack', 'well', 'pennant', 'mannikin', 'census', 'originated', 'sanctuary', 'check', 'legislators', 'presence', 'majestically', 'music', 'among', 'tackles', 'large', 'posterity', 'stripped', 'turns', 'soliloquized', 'shoved', 'legendary', 'protect', 'concerns', 'excitement', 'mustn', 'quaff', 'spurn', 'alive', 'prime', 'besieged', 'surly', 'tiller', 'engrossed', 'sleepless', 'christian', 'beauty', 'tale', 'johnny', 'ahasuerus', 'roman', 'ornament', 'tragic', 'anvil', 'archy', 'detailed', 'screen', 'unsuspected', 'tranquillity', 'roods', 'aforesaid', 'leyden', 'unvarying', 'cannikin', 'gesner', 'honours', 'slowly', 'dantean', 'seaman', 'chill', 'help', 'roll', 'indicated', 'calves', 'quickening', 'strafford', 'prove', 'acquaintances', 'emir', 'helmsman', 'southerner', 'almighty', 'engendering', 'wards', 'rugged', 'washing', 'sinister', 'ranging', 'nowise', 'treadle', 'navis', 'vats', 'fry', 'exploit', 'reflections', 'allurements', 'obscurity', 'albemarle', 'steer', 'devoutly', 'imagine', 'prudently', '73', 'horrors', 'protracted', 'interest', 'concern', 'muskets', 'rascally', 'geometry', 'chimney', 'thousandth', 'bier', 'cronies', 'extract', 'spermaceti', 
'usages', 'convictions', 'darkness', 'foregone', 'ache', 'perry', 'chase', 'remembered', 'noun', 'colder', 'inhabitants', 'spilled', 'dents', 'requisition', 'twigging', 'severed', 'brunt', 'commissioned', 'cowhide', 'populous', 'tray', 'dishonour', 'subordinates', 'defunct', 'dissolutions', 'multitudinously', 'daggoo', 'retires', 'touches', 'vibrated', 'unsourced', 'single', 'fa', 'flints', 'ordinaire', 'cataract', 'despite', 'crouching', 'treatise', 'mong', 'outs', 'virtuous', 'obey', 'boots', 'urbane', 'string', 'thirteenth', 'millions', 'complicated', '5th', 'dreaminess', 'distances', 'niphon', 'covers', 'blacks', 'gained', 'horrid', 'fares', 'fearless', 'creamy', 'mobbed', 'jumping', 'enthrone', 'indebted', 'lurking', 'price', 'sum', 'couple', 'investigations', 'mightest', 'skeletons', 'obedience', 'feverishly', 'ornamented', 'unprovided', 'humoredly', 'scheming', 'antelope', 'ocean', 'stepmother', 'promptitude', 'besmoked', 'oceans', 'coined', 'missionaries', 'fixture', 'confess', 'separated', 'fancies', 'charger', 'puritanic', 'emblem', 'pecks', 'traditions', 'faithfully', 'rude', 'chronically', 'militia', 'imminent', 'happened', 'pomp', 'northman', 'rollings', 'fluttering', 'slaughtered', "--'", 'risked', 'fought', 'jeering', 'argosy', 'preacher', 'composing', 'uncomfortableness', 'maidenly', 'snort', 'asses', 'infancy', 'undone', 'wound', 'conservatories', 'effulgences', 'piratical', 'unbutton', 'hereditarily', 'impelled', 'final', 'azure', 'honourableness', 'coax', 'wrinkling', 'unchristian', 'cascading', 'verbalists', 'heavenly', 'frugal', 'witnesses', 'dunfermline', 'clove', 'ganges', 'splice', 'liest', 'lightest', 'surcoat', 'chatting', 'prosecuted', 'consisted', 'magnitude', 'cruppered', 'greenish', 'scattered', 'comprehensive', 'unwaning', 'hiss', 'maggots', 'deliver', 'sikoke', 'flexion', 'thence', 'undressing', 'strenuous', 'receptacle', 'vacancy', 'attacks', 'cuffs', 'squid', 'separating', 'bisons', 'gentlemanlike', 'rabble', 'spiracles', 'search', 
'other', 'task', 'starve', 'blindest', 'intensifying', 'supposed', 'englishman', 'embalming', 'ease', 'brow', 'singing', 'clever', 'symmetrically', 'smallness', 'coronation', 'le', 'bomb', 'political', 'yield', 'superficially', 'maiming', 'fishermen', 'crucified', 'ceylon', 'echoes', 'haven', 'invite', 'died', 'crouch', '16', 'bolted', 'wouldn', 'unbidden', 'headmost', 'humming', 'longest', 'oaken', 'pitchpoler', 'amounted', 'hush', 'shift', 'ladder', 'specifically', 'bravadoes', 'accuracy', 'makings', 'restored', 'crystal', 'heaps', 'ejaculation', 'slew', 'presbyterians', 'fickleness', 'bowed', 'destroying', 'premeditated', 'robe', 'unbodied', 'odds', 'depicting', 'crim', 'decidedly', 'tomb', 'weed', 'essences', 'abortion', 'thereabouts', 'stirring', 'numbed', 'fowl', 'caw', 'jungfrau', 'swell', 'butts', 'mapple', 'imparted', 'fever', 'homeless', 'expose', 'uncheered', 'though', 'catalogue', 'pip', 'hate', 'rover', 'catskill', 'punishment', 'goes', 'rogue', 'swore', 'glancing', 'tasks', 'elephanta', 'clews', 'stanzas', 'torrid', 'fork', 'stupor', 'complain', 'oders', 'complimentary', 'drunken', 'hereafter', 'benjamin', 'gums', 'uncouth', 'mild', 'abhorred', 'compactness', 'cambrics', 'former', 'mogulship', 'predecessor', 'sashless', 'situated', 'oath', 'rumors', 'ignoring', 'rightly', 'external', 'labyrinth', 'marbleized', 'saint', 'islets', 'shortness', 'domineering', ...}
# Toy sentence for the filtering examples; this rebinds the `sent1` name
# that `from nltk.book import *` placed in the namespace.
sent1 = ['Chae', 'is', 'Chae', '.']

# Print every token that ends with the letter 'e'.
for xy in sent1:
    if xy.endswith('e'):
        print(xy)
Chae Chae
# Print every token of sent1 that is strictly longer than two characters.
for xy in sent1:
    if len(xy) > 2:
        print(xy)
Chae Chae
# Print every token of sent1 that has two or more characters.
for xy in sent1:
    if len(xy) >= 2:
        print(xy)
Chae is Chae
# Label each token of sent1 by its casing. Anything that is neither
# all-lowercase nor title-case falls through to the final branch and is
# reported as punctuation.
for token in sent1:
    if token.islower():
        print(token, 'is a lowercase word.')
    elif token.istitle():
        print(token, 'is a title word.')
    else:
        print(token, 'is a punctuation.')
Chae is a title word. is is a lowercase word. Chae is a title word. . is a punctuation.
# Distinct tokens of text2 containing the letter sequence 'cie' or 'cei',
# in sorted order.
tricky = sorted(w for w in set(text2) if 'cie' in w or 'cei' in w)

# Print the matches one per line.
for word in tricky:
    print(word)
ancient ceiling conceit conceited conceive conscience conscientious conscientiously deceitful deceive deceived deceiving deficiencies deficiency deficient delicacies excellencies fancied insufficiency insufficient legacies perceive perceived perceiving prescience prophecies receipt receive received receiving society species sufficient sufficiently undeceive undeceiving
# Print the same words on a single line; `end` replaces the default newline,
# so the ' ^0^ ' marker follows every word, including the last.
for word in tricky:
    print(word, end=' ^0^ ')
ancient ^0^ ceiling ^0^ conceit ^0^ conceited ^0^ conceive ^0^ conscience ^0^ conscientious ^0^ conscientiously ^0^ deceitful ^0^ deceive ^0^ deceived ^0^ deceiving ^0^ deficiencies ^0^ deficiency ^0^ deficient ^0^ delicacies ^0^ excellencies ^0^ fancied ^0^ insufficiency ^0^ insufficient ^0^ legacies ^0^ perceive ^0^ perceived ^0^ perceiving ^0^ prescience ^0^ prophecies ^0^ receipt ^0^ receive ^0^ received ^0^ receiving ^0^ society ^0^ species ^0^ sufficient ^0^ sufficiently ^0^ undeceive ^0^ undeceiving ^0^
import nltk
#nltk.chat.chatbots()