NLP Using Python: Fresco Play Hands-On Solutions (HackerRank)

NLP Using Python Fresco Play hands-on solutions: Raw Text, Corpora, Conditional Frequency Distribution, Bigrams, N-grams, Collocations, POS Tagging, and Stemming.

Note: If you run into any errors with these programs, please comment below.

LAB 1: Welcome to NLP Using Python - Simple Text Operations.

Question 1: Hands on NLP Python simple text operations - 1

Solution 1:

#!/bin/python3

#Write your code here


import math
import os
import random
import re
import sys
import zipfile
os.environ['NLTK_DATA'] = os.getcwd() + "/nltk_data"
from nltk.corpus import gutenberg
from nltk.text import Text

#
# Complete the 'calculateWordCounts' function below.
#

def calculateWordCounts(text):
    # Write your code here
    
    # Task 1:
    # Find the number of words in 'text', and print the result.
    total_words = len(text)
    print(total_words)

    # Task 2:
    # Find the number of unique words in 'text', and print the result.
    unique_words = len(set(text))
    print(unique_words)

    # Task 3:
    # Calculate the word coverage of 'text' from the number of words and the number
    # of unique words, and print the result.
    word_coverage = total_words / unique_words
    print(math.floor(word_coverage))
    
if __name__ == '__main__':
    text = input()
    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())
    
    text = Text(gutenberg.words(text))

    calculateWordCounts(text)
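
For intuition, here is a minimal, self-contained sketch of the same word-coverage calculation on a hypothetical token list (no corpus download needed; the tokens are made up purely for illustration):

import math

# Hypothetical token list standing in for gutenberg.words(fileid).
tokens = ["the", "cat", "sat", "on", "the", "mat", "the", "end"]

total_words = len(tokens)            # 8 tokens in total
unique_words = len(set(tokens))      # 6 distinct tokens
word_coverage = math.floor(total_words / unique_words)

print(total_words, unique_words, word_coverage)  # 8 6 1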

Question 2: Hands on NLP Python simple text operations - 2

Solution 2:

#!/bin/python3

#Write your code here


import math
import os
import random
import re
import sys
import zipfile

os.environ['NLTK_DATA'] = os.getcwd() + "/nltk_data"
from nltk.corpus import gutenberg
from nltk.text import Text


#
# Complete the 'filterWords' function below.
#


def filterWords(text):
    # Write your code here
    
    # Task 1:
    # Filter the words ending with 'ing' from the set of unique words of 'text',
    # and store them in the 'ing_words' variable as a list.
    unique_words = set(text)
    ing_words = [word for word in unique_words if word.endswith('ing')]

    # Task 2:
    # Filter the words whose length is greater than 15 from the complete set of 'text',
    # and store them in the 'large_words' variable as a list.
    large_words = [word for word in text if len(word.strip()) > 15]

    # Task 3:
    # Filter the words having all letters in upper case from the set of unique words
    # of 'text', and store them in the 'upper_words' variable as a list.
    upper_words = [word for word in unique_words if isinstance(word, str) and word.isupper()]

    return ing_words, large_words, upper_words

if __name__ == '__main__':
    text = input()
    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())
            
    text = Text(gutenberg.words(text))

    ing_words, big_words, upper_words = filterWords(text)

    print(sorted(ing_words))
    print(sorted(big_words))
    print(sorted(upper_words))

Question 3: Hands on NLP Python simple text operations - 3.

Solution 3:

#!/bin/python3

#Write your code here



import math
import os
import random
import re
import sys
import zipfile
os.environ['NLTK_DATA'] = os.getcwd() + "/nltk_data"
from nltk.corpus import gutenberg
from nltk.text import Text
import nltk


#
# Complete the 'findWordFreq' function below.
#
# 


def findWordFreq(text, word):
    # Write your code here
    
    # Task 1:
    # Find the frequency for the given 'word', and store it into the variable 'wordfreq'.
    textfreq = [word for word in text if word.isalpha()]
    FreqDist = nltk.FreqDist(textfreq)
    wordfreq=  FreqDist[word]
    
    # Task 2:
    # Find the word which has a maximum frequency from the 'textfreq', and store into the variable 'maxfreq'.
    max_value = max(FreqDist.values())    
    maxfreq= [ item for item in FreqDist if FreqDist[item]== max_value][0]
    
    return    wordfreq, maxfreq


if __name__ == '__main__':
    text = input()
    word = input()
    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())

    text = Text(gutenberg.words(text))

    word_freq, max_freq = findWordFreq(text, word)

    print(word_freq)
    print(max_freq)
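
For reference, here is a minimal sketch of how nltk.FreqDist behaves on a toy token list (the tokens are made up for illustration and stand in for the Gutenberg text):

import nltk

# Hypothetical token list standing in for the Gutenberg text.
tokens = ['the', 'whale', 'and', 'the', 'sea', 'and', 'the', 'ship']

freqdist = nltk.FreqDist(tokens)
print(freqdist['whale'])   # 1     -> frequency of a given word
print(freqdist.max())      # 'the' -> the most frequent word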

LAB 2: Welcome to NLP Using Python - Accessing Text Corpora

Question 1: Hands on - NLP Python accessing text corpora.

Solution 1: NLP Python accessing text corpora.

#!/bin/python3

#Write your code here



import math
import os
import random
import re
import sys
import zipfile
os.environ['NLTK_DATA'] = os.getcwd() + "/nltk_data"
import nltk


#
# Complete the 'accessTextCorpora' function below.
#
# The function accepts following parameters:
#  1. STRING fileid
#  2. STRING word
#
from nltk.corpus import inaugural
def accessTextCorpora(fileid, word):
    # Write your code here
    
    # Task 1:
    # Compute the word coverage for the given 'fileid' of the 'inaugural' text corpus,
    # and store the result in 'wordcoverage'.
    corpus_words = inaugural.words(fileid)
    total_words_length = len(corpus_words)
    unique_words_length = len(set(corpus_words))

    wordcoverage = int(total_words_length / unique_words_length)

    # Task 2:
    # Filter the words ending with 'ed' from the set of unique words for the given
    # 'fileid' of the 'inaugural' corpus, and store them in the list 'ed_words'.
    ed_words = [w for w in set(corpus_words) if w.endswith("ed")]

    # Task 3:
    # Convert all the words into lowercase, keep only the words made up of alphabets,
    # and find the frequency of the given 'word'. Store the result in 'wordfreq'.
    lower_alpha_words = [w.lower() for w in corpus_words if w.isalpha()]
    wordfreq = lower_alpha_words.count(word)
    
    # Return wordcoverage, ed_words ,wordfreq
    return wordcoverage, ed_words ,wordfreq

if __name__ == '__main__':
    fileid = input()
    word = input()

    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())

    word_coverage, ed_words, word_freq = accessTextCorpora(fileid, word)

    print(word_coverage)
    print(sorted(ed_words))
    print(word_freq)
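
As a quick sanity check, the same three computations can be run interactively. This sketch assumes the NLTK 'inaugural' corpus is available locally (e.g. via nltk.download('inaugural')) and uses the real fileid '1789-Washington.txt'; the word 'citizens' is just an example query.

from nltk.corpus import inaugural

# Assumes the 'inaugural' corpus has already been downloaded.
fileid = '1789-Washington.txt'
words = inaugural.words(fileid)

print(int(len(words) / len(set(words))))                       # word coverage
print(sorted(w for w in set(words) if w.endswith('ed'))[:5])   # a few 'ed' words
print([w.lower() for w in words if w.isalpha()].count('citizens'))  # frequency of 'citizens'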

Question 2: Hands on NLP python user specific text corpora.

Solution 2:

#!/bin/python3

#Write your code here



import math
import os
import random
import re
import sys
import nltk


#
# Complete the 'createUserTextCorpora' function below.
#
# The function accepts following parameters:
#  1. STRING filecontent1
#  2. STRING filecontent2
#
from nltk.corpus import PlaintextCorpusReader
def createUserTextCorpora(filecontent1, filecontent2):
    # Write your code here

    # Task 1:
    # Create a text file named 'content1.txt' inside the 'nltk_data' folder,
    # and write 'filecontent1' into it.
    #
    with open(os.path.join('nltk_data/', 'content1.txt'), "w") as f:
        f.write(filecontent1)

    # Task 2:
    # Create a text file named 'content2.txt' inside the 'nltk_data' folder,
    # and write 'filecontent2' into it.
    #
    with open(os.path.join('nltk_data/', 'content2.txt'), "w") as f2:
        f2.write(filecontent2)
     
    # Task 3:
    # Convert your collection of text files inside the 'nltk_data' folder into a text corpus, store it into the 'text_corpus' variable.
    #
    corpus_root_directory = 'nltk_data/'
    text_corpus = PlaintextCorpusReader(corpus_root_directory,r'.*\.txt')
    
    
    p1= text_corpus.words('content1.txt')
    p2= text_corpus.words('content2.txt')
    
    # Task 4:
    # Compute the number of words and the number of unique words for each file ID in the
    # text corpus, and store them in 'no_of_words_corpus1', 'no_of_unique_words_corpus1',
    # 'no_of_words_corpus2' and 'no_of_unique_words_corpus2'.
    #
    no_of_words_corpus1 = len(p1)
    no_of_unique_words_corpus1 = len(set(p1))

    no_of_words_corpus2 = len(p2)
    no_of_unique_words_corpus2 = len(set(p2))
    
    return text_corpus, no_of_words_corpus1, no_of_unique_words_corpus1, no_of_words_corpus2, no_of_unique_words_corpus2
        
   

if __name__ == '__main__':
    filecontent1 = input()

    filecontent2 = input()

    path = os.path.join(os.getcwd(), "nltk_data")
    os.makedirs(path, exist_ok=True)
    for file in os.listdir(path):
        os.remove(os.path.join(path, file))


    text_corpus, no_of_words_corpus1, no_of_unique_words_corpus1, no_of_words_corpus2, no_of_unique_words_corpus2 = createUserTextCorpora(filecontent1, filecontent2)
    expected_corpus_files = ['content1.txt', 'content2.txt']
    if type(text_corpus) == nltk.corpus.reader.plaintext.PlaintextCorpusReader and sorted(list(text_corpus.fileids())) == expected_corpus_files:
        print(no_of_words_corpus1)
        print(no_of_unique_words_corpus1)
        print(no_of_words_corpus2)
        print(no_of_unique_words_corpus2)
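
To see what PlaintextCorpusReader does in isolation, here is a minimal sketch that builds a tiny throw-away corpus in a temporary directory; the file name and contents are hypothetical and only NLTK itself is required (no corpus download).

import os
import tempfile
from nltk.corpus import PlaintextCorpusReader

# Build a tiny throw-away corpus in a temporary directory.
root = tempfile.mkdtemp()
with open(os.path.join(root, 'doc1.txt'), 'w') as f:
    f.write("Hello corpus world. Hello again.")

corpus = PlaintextCorpusReader(root, r'.*\.txt')
print(corpus.fileids())               # ['doc1.txt']
print(len(corpus.words('doc1.txt')))  # token count, punctuation included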

LAB 3: Welcome to NLP Using Python - Conditional Frequency Distribution.

Problem 3: Hands-On: Conditional Frequency.

Solution : Conditional Frequency

#!/bin/python3

#Write your code here



import math
import os
import random
import re
import sys
import zipfile
os.environ['NLTK_DATA'] = os.getcwd()+"/nltk_data"
import nltk

#
# Complete the 'calculateCFD' function below.
#
# The function accepts following parameters:
#  1. STRING_ARRAY cfdconditions
#  2. STRING_ARRAY cfdevents
#
from nltk.corpus import brown, stopwords
# nltk.download('brown')
def calculateCFD(cfdconditions, cfdevents):
    
    # Task 1:
    # Determine the conditional frequency of all the words (converted into lower case,
    # with all the stop words removed) for the given categories 'cfdconditions' of the
    # Brown corpus. Store the result in 'cdev_cfd' and tabulate it for the given 'cfdevents'.
    stop_words = set(stopwords.words('english'))

    cdev_cfd = nltk.ConditionalFreqDist(
        (genre, word.lower())
        for genre in cfdconditions
        for word in brown.words(categories=genre)
        if word.lower() not in stop_words)

    cdev_cfd.tabulate(conditions=cfdconditions, samples=cfdevents)

    # Task 2:
    # Determine the words ending with 'ing' or 'ed' (ignoring stop words). Compute the
    # conditional frequency distribution, where the condition is the category from
    # 'cfdconditions' and the event is either 'ing' or 'ed'.
    # Store the conditional frequency distribution in the variable 'inged_cfd'.
    inged_cfd = nltk.ConditionalFreqDist(
        (genre, 'ing' if word.lower().endswith('ing') else 'ed')
        for genre in cfdconditions
        for word in brown.words(categories=genre)
        if word.lower().endswith(('ing', 'ed')) and word.lower() not in stop_words)

    inged_cfd.tabulate(conditions=cfdconditions, samples=['ed', 'ing'])
    

if __name__ == '__main__':
    cfdconditions_count = int(input().strip())

    cfdconditions = []

    for _ in range(cfdconditions_count):
        cfdconditions_item = input()
        cfdconditions.append(cfdconditions_item)

    cfdevents_count = int(input().strip())

    cfdevents = []

    for _ in range(cfdevents_count):
        cfdevents_item = input()
        cfdevents.append(cfdevents_item)

    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())

    calculateCFD(cfdconditions, cfdevents)
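
If the Brown corpus is not at hand, the ConditionalFreqDist idea can be illustrated with a few hand-made (genre, word) pairs; the pairs below are invented purely for illustration.

import nltk

# Toy (condition, event) pairs standing in for the Brown corpus.
pairs = [('news', 'said'), ('news', 'voted'), ('news', 'voting'),
         ('romance', 'loving'), ('romance', 'loved'), ('romance', 'said')]

cfd = nltk.ConditionalFreqDist(pairs)
print(cfd['news']['said'])   # 1
cfd.tabulate(conditions=['news', 'romance'], samples=['said', 'loved'])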

LAB 4: Welcome to NLP Using Python - Processing Raw Text.

Problem 4: Hands-on Processing Raw Text

Solution:

#!/bin/python3

#Write your code here



import math
import os
import random
import re
import sys
import zipfile
os.environ['NLTK_DATA'] = os.getcwd() + "/nltk_data"
import nltk

#
# Complete the 'processRawText' function below.
#
# The function accepts STRING textURL as parameter.
#
from urllib import request
def processRawText(textURL):
    # Write your code here
    
    # Task 1: 
    # Read the text content from the given link 'textURL'. Store the content in the variable 'textcontent'.
    #    
    url = textURL
    textcontent = request.urlopen(url).read()
    
    # Task 2:
    # Tokenize all the words in 'textcontent', and convert them into lower case.
    # Store the tokenized list of words in 'tokenizedlcwords'.
    #
    text_content1 = textcontent.decode('unicode_escape')  # convert the bytes to str
    tokens1 = nltk.word_tokenize(text_content1)

    tokenizedlcwords = [word.lower() for word in tokens1]
    
    # Task 3:
    # Find the number of words in 'tokenizedlcwords', and store the result in  'noofwords'.
    #
    noofwords = len(tokenizedlcwords)
    
    # Task 4:
    # Find the number of unique words in 'tokenizedlcwords', and store the result in  'noofunqwords'.
    #
    noofunqwords = len(set( tokenizedlcwords))   
    
    # Task 5 :
    # Calculate the word coverage of 'tokenizedlcwords' obtained from the number of words and number of unique words, Store the result in the 'wordcov'.
    #
    wordcov =  int( noofwords / noofunqwords)
    
    # Task 6:
    # Determine the frequency distribution of all words having only alphabets in
    # 'tokenizedlcwords'. Store the result in the variable 'wordfreq'.
    wordfreq = nltk.FreqDist(word for word in tokenizedlcwords if word.isalpha())

    # Task 7:
    # Find the most frequent word of 'tokenizedlcwords'. Store the result in the
    # variable 'maxfreq'.
    maxfreq = wordfreq.max()

    return noofwords, noofunqwords, wordcov, maxfreq

if __name__ == '__main__':
    textURL = input()

    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())

    noofwords, noofunqwords, wordcov, maxfreq = processRawText(textURL)
    print(noofwords)
    print(noofunqwords)
    print(wordcov)
    print(maxfreq)
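
The same pipeline can be tried offline on a short string instead of a URL. This is only a sketch: the raw text is made up, and it assumes the 'punkt' tokenizer models are available (e.g. via nltk.download('punkt')).

import nltk

# Hypothetical raw text standing in for the downloaded document.
raw = "The quick brown fox jumps over the lazy dog. The dog sleeps."

tokens = [t.lower() for t in nltk.word_tokenize(raw)]
freq = nltk.FreqDist(t for t in tokens if t.isalpha())

print(len(tokens), len(set(tokens)))  # number of tokens and number of unique tokens
print(freq.max())                     # 'the'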

LAB 5: Welcome to NLP Using Python - Bigrams and Collocations.

Problem 5: Hands-on Bigrams, N-grams, and Collocations

Solution:

#!/bin/python3

#Write your code here



import math
import os
import random
import re
import sys
import zipfile
os.environ['NLTK_DATA'] = os.getcwd() + "/nltk_data"

import nltk


#
# Complete the 'performBigramsAndCollocations' function below.
#
# The function accepts following parameters:
#  1. STRING textcontent
#  2. STRING word
#

#
from nltk.corpus import stopwords
def performBigramsAndCollocations(textcontent, word):
    # Write your code here
    
    # Task 1: Tokenize all the words given in 'textcontent'. A word may contain alphabets,
    # numbers or underscores. Store the tokenized list of words in 'tokenizedwords'.
    reg_exp = r'([A-Za-z0-9_]+)'
    tokenizedwords = nltk.regexp_tokenize(textcontent, pattern=reg_exp)

    # Task 2: Convert all the words into lowercase. Store the result in 'tokenizedwords'.
    tokenizedwords = [word.lower() for word in tokenizedwords]

    # Task 3: Compute bigrams of the list 'tokenizedwords'. Store the bigrams in
    # 'tokenizedwordsbigrams'.
    tokenizedwordsbigrams = nltk.bigrams(tokenizedwords)
    
    # Task 4:
    # Filter only the bigrams from 'tokenizedwordsbigrams' where neither word is a stop word.
    # Store the result in 'tokenizednonstopwordsbigrams'.
    sw = set(stopwords.words('english'))

    tokenizednonstopwordsbigrams = [bigram for bigram in tokenizedwordsbigrams
                                    if bigram[0] not in sw and bigram[1] not in sw]
    
    # Task 5:
    # Compute the Conditional Frequency of 'tokenizednonstopwordsbigrams', where condition and event refer to the word.
    # Store the result in 'cfd_bigrams'.
    
    cfd_bigrams = nltk.ConditionalFreqDist(tokenizednonstopwordsbigrams)
    
    # Task 6:
    # Determine the three most frequent words occurring after the given 'word'.
    # Store the result in 'mostfrequentwordafter'.
    mostfrequentwordafter = cfd_bigrams[word].most_common(3)
    
    # Task 7:
    # Generate collocations from 'tokenizedwords'. Store the list of collocation phrases
    # in 'collocationwords'.
    collocation_list = nltk.Text(tokenizedwords).collocation_list()
    collocationwords = [" ".join([w1, w2]) for w1, w2 in collocation_list]

    return mostfrequentwordafter, collocationwords
    
    

if __name__ == '__main__':
    textcontent = input()

    word = input()

    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())

    mostfrequentwordafter, collocationwords = performBigramsAndCollocations(textcontent, word)
    print(sorted(mostfrequentwordafter, key=lambda element: (element[1], element[0]), reverse=True))
    print(sorted(collocationwords))
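
A quick way to see what bigrams and their conditional frequencies look like is to run them on a toy token list; the sentence below is invented for illustration and needs no corpus data.

import nltk

# Toy token list, purely for illustration.
tokens = "it was a bright cold day in april and the clocks were striking".split()

bigrams = list(nltk.bigrams(tokens))
print(bigrams[:3])               # [('it', 'was'), ('was', 'a'), ('a', 'bright')]

cfd = nltk.ConditionalFreqDist(bigrams)
print(cfd['a'].most_common(1))   # [('bright', 1)]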

LAB 6: Welcome to NLP Using Python - Stemming and Lemmatization.

Problem 6: Hands on Stemming and Lemmatization

Solution:

#!/bin/python3

#Write your code here



import math
import os
import random
import re
import sys
import zipfile
os.environ['NLTK_DATA'] = os.getcwd()+"/nltk_data"
import nltk

#
# Complete the 'performStemAndLemma' function below.
#
# The function accepts STRING textcontent as parameter.

from nltk.corpus import stopwords
def performStemAndLemma(textcontent):
    # Write your code here
    # Task 1:
    # Tokenize all the words given in 'textcontent'. A word may contain alphabets,
    # numbers or underscores. Store the tokenized list of words in 'tokenizedwords'.
    reg_exp = r'([A-Za-z0-9_]+)'
    tokenizedwords = nltk.regexp_tokenize(textcontent, pattern=reg_exp)

    # Task 2: Convert all the unique words into lowercase. Store the result in 'tokenizedwords'.
    tokenizedwords = [word.lower() for word in set(tokenizedwords)]

    # Task 3:
    # Remove all the stop words from 'tokenizedwords'. Store the result in 'filteredwords'.
    sw = set(stopwords.words('english'))
    filteredwords = [word for word in tokenizedwords if word not in sw]

    # Task 4:
    # Stem each word in 'filteredwords' with 'PorterStemmer', and store the result in
    # the list 'porterstemmedwords'.
    porter = nltk.PorterStemmer()
    porterstemmedwords = [porter.stem(word) for word in filteredwords]

    # Task 5:
    # Stem each word in 'filteredwords' with 'LancasterStemmer', and store the result in
    # the list 'lancasterstemmedwords'.
    lancaster = nltk.LancasterStemmer()
    lancasterstemmedwords = [lancaster.stem(word) for word in filteredwords]

    # Task 6:
    # Lemmatize each word in 'filteredwords' with 'WordNetLemmatizer', and store the
    # result in the list 'lemmatizedwords'.
    lemmatizer = nltk.WordNetLemmatizer()
    lemmatizedwords = [lemmatizer.lemmatize(word) for word in filteredwords]
     
    
    return porterstemmedwords, lancasterstemmedwords, lemmatizedwords

if __name__ == '__main__':
    textcontent = input()

    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())

    porterstemmedwords, lancasterstemmedwords, lemmatizedwords = performStemAndLemma(textcontent)

    print(sorted(porterstemmedwords))
    print(sorted(lancasterstemmedwords))
    print(sorted(lemmatizedwords))
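
To see how the two stemmers and the lemmatizer differ, here is a small comparison sketch; the word list is made up, and the lemmatizer assumes the 'wordnet' data is available (e.g. nltk.download('wordnet')).

import nltk

# Compare PorterStemmer, LancasterStemmer and WordNetLemmatizer on sample words.
words = ['running', 'studies', 'better', 'leaves']

porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
lemmatizer = nltk.WordNetLemmatizer()

for w in words:
    print(w, porter.stem(w), lancaster.stem(w), lemmatizer.lemmatize(w))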



LAB 7: Welcome to NLP Using Python - POS Tagging

Problem 7: Hands on POS Tagging.

Solution: Hands on POS Tagging.

#!/bin/python3

#Write your code here
import math
import os
import random
import re
import sys
import zipfile
os.environ['NLTK_DATA'] = os.getcwd() + "/nltk_data"
from nltk.corpus import brown
import nltk



#
# Complete the 'tagPOS' function below.
#
# The function accepts following parameters:
#  1. STRING textcontent
#  2. STRING taggedtextcontent
#

def tagPOS(textcontent, taggedtextcontent, defined_tags):
    # Write your code here
    
    # Task 1:
    # Tag the part of speech for the given 'textcontent' words, store the result into the variable 'nltk_pos_tags'.
    words = nltk.word_tokenize(textcontent)    
    nltk_pos_tags= nltk.pos_tag(words)
    
    # Task 2:
    # Tag the part of speech for the given 'taggedtextcontent' words using the 'Tagged Text method'. Store the result into the variable 'tagged_pos_tag'.
    tagged_pos_tag =  [ nltk.tag.str2tuple(word) for word in taggedtextcontent.split() ]
    
    # Task 3:
    # Tag the part of speech for the given 'textcontent' words, using 'defined_tags' as
    # the model in the 'Lookup Tagger' method. Store the result in 'unigram_pos_tag'.
    #
    baseline_tagger = nltk.UnigramTagger(model=defined_tags)
    unigram_pos_tag = baseline_tagger.tag(words)
    
    return nltk_pos_tags, tagged_pos_tag, unigram_pos_tag
    

if __name__ == '__main__':
    textcontent = input()

    taggedtextcontent = input()
    
    if not os.path.exists(os.getcwd() + "/nltk_data"):
        with zipfile.ZipFile("nltk_data.zip", 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())

    defined_tags = dict(brown.tagged_words(tagset='universal'))

    nltk_pos_tags, tagged_pos_tag, unigram_pos_tag = tagPOS(textcontent, taggedtextcontent, defined_tags)

    print(nltk_pos_tags)
    print(tagged_pos_tag)
    print(unigram_pos_tag)
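
The three tagging styles used above can be tried in isolation with the sketch below. The sentence, the word/TAG string and the lookup dictionary are hypothetical examples, and the default tagger assumes the usual NLTK models are available (e.g. nltk.download('punkt') and nltk.download('averaged_perceptron_tagger')).

import nltk

sentence = "The quick brown fox jumps"

# 1. Default NLTK POS tagger.
print(nltk.pos_tag(nltk.word_tokenize(sentence)))

# 2. Tagged-text method: parse word/TAG strings into (word, tag) tuples.
print([nltk.tag.str2tuple(t) for t in "The/AT fox/NN".split()])

# 3. Lookup (unigram) tagger built from a hypothetical word-to-tag dictionary.
model = {'The': 'DET', 'fox': 'NOUN'}
tagger = nltk.UnigramTagger(model=model)
print(tagger.tag(['The', 'quick', 'fox']))  # unknown words get the tag None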

About the author

D Shwari
I'm a professor in the Department of Computer Science at National University. My main areas are data science and data analysis, along with project management for many computer-science-related sectors. My next project is on AI with deep learning.
