Kindly note: if you run into any programming errors, please comment below.
LAB 1: Welcome to NLP Using Python - Simple Text Operation.
Question 1: Hands on NLP Python simple text operations - 1
Solution 1:
#!/bin/python3
# Write your code here
import math
import os
import random
import re
import sys
import zipfile

os.environ['NLTK_DATA'] = os.getcwd() + "/nltk_data"

from nltk.corpus import gutenberg
from nltk.text import Text

#
# Complete the 'calculateWordCounts' function below.
#
def calculateWordCounts(text):
    # Write your code here
    # Task 1:
    # Find the number of words in 'text', and print the result.
    total_words = len(text)
    print(total_words)

    # Task 2:
    # Find the number of unique words in 'text', and print the result.
    unique_words = len(set(text))
    print(unique_words)

    # Task 3:
    # Calculate the word coverage of 'text' obtained from the number of words
    # and the number of unique words, and print the result.
    average = total_words / unique_words
    print(math.floor(average))

if __name__ == '__main__':
    text = input()

    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())

    text = Text(gutenberg.words(text))

    calculateWordCounts(text)
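For a quick sanity check of the word-coverage arithmetic outside the grader, here is a minimal sketch on a made-up token list (the toy data below is illustrative, not part of the exercise):

import math
sample_tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat', 'cat']
total_words = len(sample_tokens)          # 7 tokens in total
unique_words = len(set(sample_tokens))    # 5 distinct tokens
print(total_words, unique_words, math.floor(total_words / unique_words))   # 7 5 1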
Question 2: Hands on NLP Python simple text operations - 2
Solution 2:
#!/bin/python3
# Write your code here
import math
import os
import random
import re
import sys
import zipfile

os.environ['NLTK_DATA'] = os.getcwd() + "/nltk_data"

from nltk.corpus import gutenberg
from nltk.text import Text

#
# Complete the 'filterWords' function below.
#
def filterWords(text):
    # Write your code here
    # Task 1:
    # Filter the words ending with 'ing' from the set of unique words of 'text',
    # and store them into the 'ing_words' variable as a list.
    unique_words = set(text)
    ing_words = [word for word in unique_words if word.endswith('ing')]

    # Task 2:
    # Filter the words whose length is greater than 15 from the complete set of 'text',
    # and store them into the 'large_words' variable as a list.
    large_words = [word for word in text if len(word.strip()) > 15]

    # Task 3:
    # Filter the words having all letters in upper case from the set of unique words
    # of 'text', and store them into the 'upper_words' variable as a list.
    upper_words = [word for word in unique_words if isinstance(word, str) and word.isupper()]

    return ing_words, large_words, upper_words

if __name__ == '__main__':
    text = input()

    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())

    text = Text(gutenberg.words(text))

    ing_words, big_words, upper_words = filterWords(text)

    print(sorted(ing_words))
    print(sorted(big_words))
    print(sorted(upper_words))
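The three filters are plain list comprehensions; a minimal standalone sketch with made-up tokens (not the Gutenberg data) shows what each one keeps:

sample_tokens = ['Running', 'walking', 'NASA', 'a', 'internationalisation', 'USA', 'walking']
unique_tokens = set(sample_tokens)
print([w for w in unique_tokens if w.endswith('ing')])     # words ending in 'ing'
print([w for w in sample_tokens if len(w.strip()) > 15])   # words longer than 15 characters
print([w for w in unique_tokens if w.isupper()])           # fully upper-case words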
Question 3: Hands on NLP Python simple text operations - 3.
Solution 3:
#!/bin/python3
# Write your code here
import math
import os
import random
import re
import sys
import zipfile

os.environ['NLTK_DATA'] = os.getcwd() + "/nltk_data"

from nltk.corpus import gutenberg
from nltk.text import Text
import nltk

#
# Complete the 'findWordFreq' function below.
#
def findWordFreq(text, word):
    # Write your code here
    # Task 1:
    # Find the frequency for the given 'word', and store it into the variable 'wordfreq'.
    textfreq = nltk.FreqDist(w for w in text if w.isalpha())
    wordfreq = textfreq[word]

    # Task 2:
    # Find the word which has the maximum frequency in 'textfreq',
    # and store it into the variable 'maxfreq'.
    maxfreq = textfreq.max()

    return wordfreq, maxfreq

if __name__ == '__main__':
    text = input()
    word = input()

    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())

    text = Text(gutenberg.words(text))

    word_freq, max_freq = findWordFreq(text, word)

    print(word_freq)
    print(max_freq)
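nltk.FreqDist does the counting here; a minimal sketch on a toy list (assuming only that NLTK is installed) shows the two lookups the solution relies on:

import nltk
freq = nltk.FreqDist(['to', 'be', 'or', 'not', 'to', 'be'])
print(freq['be'])    # frequency of a specific word -> 2
print(freq.max())    # most frequent word; ties go to the first word inserted, here 'to'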
LAB 2: Welcome to NLP Using Python - Accessing Text Corpora
Question 1: Hands on - NLP Python accessing text corpora.
Solution 1: NLP Python accessing text corpora.
#!/bin/python3
# Write your code here
import math
import os
import random
import re
import sys
import zipfile

os.environ['NLTK_DATA'] = os.getcwd() + "/nltk_data"

import nltk
from nltk.corpus import inaugural

#
# Complete the 'accessTextCorpora' function below.
#
# The function accepts following parameters:
#  1. STRING fileid
#  2. STRING word
#
def accessTextCorpora(fileid, word):
    # Write your code here
    # Task 1:
    # Compute the word coverage for the given 'fileid' associated with the text corpus
    # 'inaugural', and store the result into 'wordcoverage'.
    total_words = inaugural.words(fileid)
    wordfreq_dist = nltk.FreqDist(total_words)
    no_of_words = len(total_words)
    no_of_unique_words = len(wordfreq_dist)
    wordcoverage = int(no_of_words / no_of_unique_words)

    # Task 2:
    # Filter the words ending with 'ed' from the set of unique words for the given
    # 'fileid' of the 'inaugural' corpus, and store them into the 'ed_words' variable as a list.
    ed_words = [w for w in set(total_words) if w.endswith("ed")]

    # Task 3:
    # Convert all the words into lowercase. Determine the frequency distribution of all
    # the words having only alphabets for the given 'fileid' of the 'inaugural' corpus,
    # and store it into the variable 'textfreq'. Find the frequency for the given 'word',
    # and store it into 'wordfreq'.
    textfreq = nltk.FreqDist(w.lower() for w in total_words if w.isalpha())
    wordfreq = textfreq[word]

    # Return wordcoverage, ed_words, wordfreq
    return wordcoverage, ed_words, wordfreq

if __name__ == '__main__':
    fileid = input()
    word = input()

    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())

    word_coverage, ed_words, word_freq = accessTextCorpora(fileid, word)

    print(word_coverage)
    print(sorted(ed_words))
    print(word_freq)
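The word coverage is just total tokens divided by distinct tokens; nltk.FreqDist exposes both counts directly, so an equivalent shorthand for Task 1 would look like the sketch below (the fileid is only an example, and it assumes the inaugural corpus data is available locally):

import nltk
from nltk.corpus import inaugural
fd = nltk.FreqDist(inaugural.words('1789-Washington.txt'))   # example fileid
print(int(fd.N() / fd.B()))   # N() = total samples, B() = number of distinct samples (bins)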
Question 2: Hands on NLP python user specific text corpora.
Solution 2:
#!/bin/python3
# Write your code here
import math
import os
import random
import re
import sys

import nltk
from nltk.corpus import PlaintextCorpusReader

#
# Complete the 'createUserTextCorpora' function below.
#
# The function accepts following parameters:
#  1. STRING filecontent1
#  2. STRING filecontent2
#
def createUserTextCorpora(filecontent1, filecontent2):
    # Write your code here
    # Task 1:
    # Create a text file called 'content1.txt', and write the content 'filecontent1'
    # inside the nltk_data folder.
    with open(os.path.join('nltk_data', 'content1.txt'), "w") as f1:
        f1.write(filecontent1)

    # Task 2:
    # Create a text file called 'content2.txt', and write the content 'filecontent2'
    # inside the nltk_data folder.
    with open(os.path.join('nltk_data', 'content2.txt'), "w") as f2:
        f2.write(filecontent2)

    # Task 3:
    # Convert the collection of text files inside the 'nltk_data' folder into a text
    # corpus, and store it into the 'text_corpus' variable.
    corpus_root_directory = 'nltk_data/'
    text_corpus = PlaintextCorpusReader(corpus_root_directory, r'.*\.txt')
    words1 = text_corpus.words('content1.txt')
    words2 = text_corpus.words('content2.txt')

    # Task 4:
    # Compute the number of words and the number of unique words of all the file IDs
    # associated with the text corpus, and store them into 'no_of_words_corpus1',
    # 'no_of_unique_words_corpus1', 'no_of_words_corpus2' and 'no_of_unique_words_corpus2'.
    no_of_words_corpus1 = len(words1)
    no_of_unique_words_corpus1 = len(set(words1))
    no_of_words_corpus2 = len(words2)
    no_of_unique_words_corpus2 = len(set(words2))

    return text_corpus, no_of_words_corpus1, no_of_unique_words_corpus1, no_of_words_corpus2, no_of_unique_words_corpus2

if __name__ == '__main__':
    filecontent1 = input()
    filecontent2 = input()

    path = os.path.join(os.getcwd(), "nltk_data")
    os.makedirs(path, exist_ok=True)
    for file in os.listdir(path):
        os.remove(os.path.join(path, file))

    text_corpus, no_of_words_corpus1, no_of_unique_words_corpus1, no_of_words_corpus2, no_of_unique_words_corpus2 = createUserTextCorpora(filecontent1, filecontent2)

    expected_corpus_files = ['content1.txt', 'content2.txt']
    if type(text_corpus) == nltk.corpus.reader.plaintext.PlaintextCorpusReader and sorted(list(text_corpus.fileids())) == expected_corpus_files:
        print(no_of_words_corpus1)
        print(no_of_unique_words_corpus1)
        print(no_of_words_corpus2)
        print(no_of_unique_words_corpus2)
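PlaintextCorpusReader simply treats a folder of .txt files as a corpus; a minimal standalone sketch (writing to a hypothetical temporary folder rather than nltk_data) looks like this:

import os, tempfile
from nltk.corpus import PlaintextCorpusReader

root = tempfile.mkdtemp()
with open(os.path.join(root, 'demo.txt'), 'w') as f:
    f.write('A tiny demo corpus with a few words.')

corpus = PlaintextCorpusReader(root, r'.*\.txt')
print(corpus.fileids())               # ['demo.txt']
print(len(corpus.words('demo.txt')))  # token count of the file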
LAB 3: Welcome to NLP Using Python - Conditional Frequency Distribution.
Problem 3: Hands-On: Conditional Frequency.
Solution: Conditional Frequency
#!/bin/python3
# Write your code here
import math
import os
import random
import re
import sys
import zipfile

os.environ['NLTK_DATA'] = os.getcwd() + "/nltk_data"

import nltk
from nltk.corpus import brown, stopwords
# nltk.download('brown')

#
# Complete the 'calculateCFD' function below.
#
# The function accepts following parameters:
#  1. STRING_ARRAY cfdconditions
#  2. STRING_ARRAY cfdevents
#
def calculateCFD(cfdconditions, cfdevents):
    # Task 1:
    # Determine the conditional frequency of all the words (converted into lower case)
    # for the given categories 'cfdconditions' of the brown corpus. Store the result in
    # 'cdev_cfd' and tabulate it for the given 'cfdevents'.
    cdev_cfd = nltk.ConditionalFreqDist(
        (genre, word.lower())
        for genre in cfdconditions
        for word in brown.words(categories=genre)
    )
    cdev_cfd.tabulate(conditions=cfdconditions, samples=cfdevents)

    stop_words = set(stopwords.words('english'))

    # Task 2:
    # Determine the words ending with 'ing' or 'ed'. Compute the conditional frequency
    # distribution, where the condition is 'cfdconditions' and the event is either 'ing'
    # or 'ed'. Store the conditional frequency distribution in the variable 'inged_cfd'.
    temp_genre = [
        (genre, word.lower())
        for genre in brown.categories()
        for word in brown.words(categories=genre)
        if word.lower().endswith('ing') or word.lower().endswith('ed')
    ]
    generic_word_list = [list(x) for x in temp_genre]
    for wd in generic_word_list:
        if wd[1].endswith('ing') and wd[1] not in stop_words:
            wd[1] = 'ing'
        elif wd[1].endswith('ed') and wd[1] not in stop_words:
            wd[1] = 'ed'

    inged_cfd = nltk.ConditionalFreqDist(generic_word_list)
    inged_cfd.tabulate(conditions=cfdconditions, samples=['ed', 'ing'])

if __name__ == '__main__':
    cfdconditions_count = int(input().strip())

    cfdconditions = []
    for _ in range(cfdconditions_count):
        cfdconditions_item = input()
        cfdconditions.append(cfdconditions_item)

    cfdevents_count = int(input().strip())

    cfdevents = []
    for _ in range(cfdevents_count):
        cfdevents_item = input()
        cfdevents.append(cfdevents_item)

    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())

    calculateCFD(cfdconditions, cfdevents)
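A ConditionalFreqDist is just one frequency distribution per condition; a minimal sketch with made-up (condition, event) pairs shows what tabulate() prints:

import nltk
pairs = [('news', 'ed'), ('news', 'ing'), ('news', 'ed'),
         ('romance', 'ing'), ('romance', 'ing'), ('romance', 'ed')]
cfd = nltk.ConditionalFreqDist(pairs)
cfd.tabulate(conditions=['news', 'romance'], samples=['ed', 'ing'])
# prints a table with one row per condition and one column per sample:
#           ed  ing
#    news    2    1
# romance    1    2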
LAB 4: Welcome to NLP Using Python - Processing Raw Text.
Problem 4: Hands on Processing Raw Text
Solution:
#!/bin/python3
# Write your code here
import math
import os
import random
import re
import sys
import zipfile

os.environ['NLTK_DATA'] = os.getcwd() + "/nltk_data"

import nltk
from urllib import request

#
# Complete the 'processRawText' function below.
#
# The function accepts STRING textURL as parameter.
#
def processRawText(textURL):
    # Write your code here
    # Task 1:
    # Read the text content from the given link 'textURL'. Store the content in the
    # variable 'textcontent'.
    textcontent = request.urlopen(textURL).read()

    # Task 2:
    # Tokenize all the words in 'textcontent', and convert them into lower case.
    # Store the tokenized list of words in 'tokenizedlcwords'.
    decodedcontent = textcontent.decode('unicode_escape')   # convert bytes to str
    tokenizedwords = nltk.word_tokenize(decodedcontent)
    tokenizedlcwords = [word.lower() for word in tokenizedwords]

    # Task 3:
    # Find the number of words in 'tokenizedlcwords', and store the result in 'noofwords'.
    noofwords = len(tokenizedlcwords)

    # Task 4:
    # Find the number of unique words in 'tokenizedlcwords', and store the result in
    # 'noofunqwords'.
    noofunqwords = len(set(tokenizedlcwords))

    # Task 5:
    # Calculate the word coverage of 'tokenizedlcwords' obtained from the number of words
    # and the number of unique words. Store the result in 'wordcov'.
    wordcov = int(noofwords / noofunqwords)

    # Task 6:
    # Determine the frequency distribution of all words having only alphabets in
    # 'tokenizedlcwords'. Store the result in the variable 'wordfreq'.
    wordfreq = nltk.FreqDist(word for word in tokenizedlcwords if word.isalpha())

    # Task 7:
    # Find the most frequent word of 'tokenizedlcwords'. Store the result in the
    # variable 'maxfreq'.
    maxfreq = wordfreq.max()

    return noofwords, noofunqwords, wordcov, maxfreq

if __name__ == '__main__':
    textURL = input()

    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())

    noofwords, noofunqwords, wordcov, maxfreq = processRawText(textURL)

    print(noofwords)
    print(noofunqwords)
    print(wordcov)
    print(maxfreq)
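Outside the grader, the same tokenize/coverage/frequency pipeline can be tried on any short string; a minimal sketch (assuming the NLTK 'punkt' tokenizer data is available) would be:

import nltk
raw = "The quick brown fox jumps over the lazy dog. The dog sleeps."
tokens = [w.lower() for w in nltk.word_tokenize(raw)]
print(len(tokens), len(set(tokens)), int(len(tokens) / len(set(tokens))))
freq = nltk.FreqDist(w for w in tokens if w.isalpha())
print(freq.max())   # 'the'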
LAB 5: Welcome to NLP Using Python - Bigrams and Collocations.
Problem 5: Hands on Bigrams, ngrams
Solution:
#!/bin/python3
# Write your code here
import math
import os
import random
import re
import sys
import zipfile

os.environ['NLTK_DATA'] = os.getcwd() + "/nltk_data"

import nltk
from nltk.corpus import stopwords

#
# Complete the 'performBigramsAndCollocations' function below.
#
# The function accepts following parameters:
#  1. STRING textcontent
#  2. STRING word
#
def performBigramsAndCollocations(textcontent, word):
    # Write your code here
    # Task 1:
    # Tokenize all the words given in 'textcontent'. A word may contain alphabets,
    # numbers or underscores. Store the tokenized list of words in 'tokenizedwords'.
    reg_exp = r'[A-Za-z0-9_]+'
    tokenizedwords = nltk.regexp_tokenize(textcontent, pattern=reg_exp)

    # Task 2:
    # Convert all the words into lowercase. Store the result in 'tokenizedwords'.
    tokenizedwords = [w.lower() for w in tokenizedwords]

    # Task 3:
    # Compute the bigrams of the list 'tokenizedwords'. Store the list of bigrams in
    # 'tokenizedwordsbigrams'.
    tokenizedwordsbigrams = list(nltk.bigrams(tokenizedwords))

    # Task 4:
    # Filter only the bigrams from 'tokenizedwordsbigrams' where neither word is part
    # of 'stopwords'. Store the result in 'tokenizednonstopwordsbigrams'.
    sw = set(stopwords.words('english'))
    tokenizednonstopwordsbigrams = [
        bigram for bigram in tokenizedwordsbigrams
        if bigram[0] not in sw and bigram[1] not in sw
    ]

    # Task 5:
    # Compute the conditional frequency of 'tokenizednonstopwordsbigrams', where the
    # condition and event refer to the word. Store the result in 'cfd_bigrams'.
    cfd_bigrams = nltk.ConditionalFreqDist(tokenizednonstopwordsbigrams)

    # Task 6:
    # Determine the three most frequent words occurring after the given 'word'.
    # Store the result in 'mostfrequentwordafter'.
    mostfrequentwordafter = cfd_bigrams[word].most_common(3)

    # Task 7:
    # Generate collocations from 'tokenizedwords'. Store the list of collocation words
    # in 'collocationwords'.
    collocation_list = nltk.Text(tokenizedwords).collocation_list()
    collocationwords = [" ".join([w1, w2]) for w1, w2 in collocation_list]

    return mostfrequentwordafter, collocationwords

if __name__ == '__main__':
    textcontent = input()
    word = input()

    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())

    mostfrequentwordafter, collocationwords = performBigramsAndCollocations(textcontent, word)

    print(sorted(mostfrequentwordafter, key=lambda element: (element[1], element[0]), reverse=True))
    print(sorted(collocationwords))
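The bigram-to-CFD step can be seen in isolation with a toy sentence; this sketch (toy data, not the graded input) shows how the conditional distribution answers "which words follow word X?":

import nltk
tokens = ['i', 'like', 'green', 'eggs', 'and', 'i', 'like', 'ham']
bigrams = list(nltk.bigrams(tokens))
cfd = nltk.ConditionalFreqDist(bigrams)
print(cfd['i'].most_common(3))     # [('like', 2)]
print(cfd['like'].most_common(3))  # [('green', 1), ('ham', 1)]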
LAB 6: Welcome to NLP Using Python - Stemming and Lemmatization.
Problem 6: Hands on Stemming and Lemmatization
Solution:
#!/bin/python3
# Write your code here
import math
import os
import random
import re
import sys
import zipfile

os.environ['NLTK_DATA'] = os.getcwd() + "/nltk_data"

import nltk
from nltk.corpus import stopwords

#
# Complete the 'performStemAndLemma' function below.
#
# The function accepts STRING textcontent as parameter.
#
def performStemAndLemma(textcontent):
    # Write your code here
    # Task 1:
    # Tokenize all the words given in 'textcontent'. A word may contain alphabets,
    # numbers or underscores. Store the tokenized list of words in 'tokenizedwords'.
    reg_exp = r'[A-Za-z0-9_]+'
    tokenizedwords = nltk.regexp_tokenize(textcontent, pattern=reg_exp)

    # Task 2:
    # Convert all the unique words into lowercase. Store the result in 'tokenizedwords'.
    tokenizedwords = [word.lower() for word in set(tokenizedwords)]

    # Task 3:
    # Remove all the stop words from 'tokenizedwords'. Store the result into the
    # variable 'filteredwords'.
    sw = set(stopwords.words('english'))
    filteredwords = [word for word in tokenizedwords if word not in sw]

    # Task 4:
    # Stem each word present in 'filteredwords' with 'PorterStemmer', and store the
    # result in the list 'porterstemmedwords'.
    porter = nltk.PorterStemmer()
    porterstemmedwords = [porter.stem(word) for word in filteredwords]

    # Task 5:
    # Stem each word present in 'filteredwords' with 'LancasterStemmer', and store the
    # result in the list 'lancasterstemmedwords'.
    lancaster = nltk.LancasterStemmer()
    lancasterstemmedwords = [lancaster.stem(word) for word in filteredwords]

    # Task 6:
    # Lemmatize each word present in 'filteredwords' with 'WordNetLemmatizer', and store
    # the result in the list 'lemmatizedwords'.
    lemmatizer = nltk.WordNetLemmatizer()
    lemmatizedwords = [lemmatizer.lemmatize(word) for word in filteredwords]

    return porterstemmedwords, lancasterstemmedwords, lemmatizedwords

if __name__ == '__main__':
    textcontent = input()

    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())

    porterstemmedwords, lancasterstemmedwords, lemmatizedwords = performStemAndLemma(textcontent)

    print(sorted(porterstemmedwords))
    print(sorted(lancasterstemmedwords))
    print(sorted(lemmatizedwords))
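The three normalisers often disagree, which is why the exercise compares them; a small sketch on a few toy words (assuming the WordNet data is available for the lemmatizer) makes the difference visible:

import nltk
words = ['studies', 'running', 'leaves']
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
lemmatizer = nltk.WordNetLemmatizer()
print([porter.stem(w) for w in words])
print([lancaster.stem(w) for w in words])
print([lemmatizer.lemmatize(w) for w in words])
# The two stemmers clip suffixes and may produce non-words (e.g. Porter gives 'studi'),
# while the lemmatizer maps to dictionary forms (e.g. 'leaves' -> 'leaf').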
LAB 7: Welcome to NLP Using Python - POS Tagging
Problem 7: Hands on POS Tagging.
Solution: Hands on POS Tagging.
#!/bin/python3
# Write your code here
import math
import os
import random
import re
import sys
import zipfile

os.environ['NLTK_DATA'] = os.getcwd() + "/nltk_data"

import nltk
from nltk.corpus import brown

#
# Complete the 'tagPOS' function below.
#
# The function accepts following parameters:
#  1. STRING textcontent
#  2. STRING taggedtextcontent
#
def tagPOS(textcontent, taggedtextcontent, defined_tags):
    # Write your code here
    # Task 1:
    # Tag the parts of speech for the given 'textcontent' words, and store the result
    # into the variable 'nltk_pos_tags'.
    words = nltk.word_tokenize(textcontent)
    nltk_pos_tags = nltk.pos_tag(words)

    # Task 2:
    # Tag the parts of speech for the given 'taggedtextcontent' words using the
    # 'Tagged Text' method. Store the result into the variable 'tagged_pos_tag'.
    tagged_pos_tag = [nltk.tag.str2tuple(word) for word in taggedtextcontent.split()]

    # Task 3:
    # Tag the parts of speech for the given 'textcontent' words, using 'defined_tags'
    # as the model in the 'Lookup Tagger' method. Store the result into the variable
    # 'unigram_pos_tag'.
    baseline_tagger = nltk.UnigramTagger(model=defined_tags)
    unigram_pos_tag = baseline_tagger.tag(words)

    return nltk_pos_tags, tagged_pos_tag, unigram_pos_tag

if __name__ == '__main__':
    textcontent = input()
    taggedtextcontent = input()

    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())

    defined_tags = dict(brown.tagged_words(tagset='universal'))

    nltk_pos_tags, tagged_pos_tag, unigram_pos_tag = tagPOS(textcontent, taggedtextcontent, defined_tags)

    print(nltk_pos_tags)
    print(tagged_pos_tag)
    print(unigram_pos_tag)
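The 'Tagged Text' and 'Lookup Tagger' methods can be tried with tiny inputs; a minimal sketch with a made-up lookup model (assuming only NLTK itself, no corpus download) would be:

import nltk
print(nltk.tag.str2tuple('bear/NN'))         # ('bear', 'NN')
model = {'the': 'DET', 'dog': 'NOUN'}        # hypothetical lookup model
tagger = nltk.UnigramTagger(model=model)
print(tagger.tag(['the', 'dog', 'barked']))  # words missing from the model get tag None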