
##############################################################################
# This example is pretty much entirely based on this excellent blog post:
# http://glowingpython.blogspot.in/2014/09/text-summarization-with-nltk.html
# Thanks to TheGlowingPython, the good soul that wrote this excellent article!
# That blog is really interesting, btw.
##############################################################################

##############################################################################
# nltk = "natural language toolkit" is a Python library with support for
# natural language processing. Super-handy.
# Specifically, we will use 2 functions from nltk:
#   sent_tokenize: given a body of text, tokenize (split) it into sentences
#   word_tokenize: given a body of text, tokenize (split) it into words
# and stopwords.words('english') to find and ignore very common words ('I', 'the', ...)
# To use the stopwords and the tokenizers, you need to have downloaded the
# corresponding nltk data first - a one-off setup, done via nltk.download() below.
##############################################################################

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
nltk.download('punkt')   # sent_tokenize / word_tokenize rely on the 'punkt' tokenizer models
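
# A quick, minimal sketch of what these helpers do (the sample sentence below is made up
# purely for illustration):
example_text = "NLTK is handy. It splits text into sentences and words."
print(sent_tokenize(example_text))
# ['NLTK is handy.', 'It splits text into sentences and words.']
print(word_tokenize(example_text.lower()))
# ['nltk', 'is', 'handy', '.', 'it', 'splits', 'text', 'into', 'sentences', 'and', 'words', '.']
print(stopwords.words('english')[:5])
# a few very common words, e.g. ['i', 'me', 'my', 'myself', 'we']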

##############################################################################
# We have used dictionaries so far, but now that we have covered classes, this is a good
# time to introduce defaultdict. This is a class that inherits from dict, but has
# one additional nice feature: usually, a Python dictionary throws a KeyError if you try
# to get an item with a key that is not currently in the dictionary.
# The defaultdict, in contrast, will simply create any items that you try to access
# (provided of course they do not exist yet). To create such a "default" item, it relies
# on a function that is passed in... see the short example after the import below.
##############################################################################

from collections import defaultdict
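
# A minimal sketch of the difference (the keys used here are made up for illustration):
counts = defaultdict(int)      # int() returns 0, so missing keys get a default value of 0
counts['apple'] += 1           # 'apple' is created with value 0, then incremented - no KeyError
print(counts['banana'])        # 0 - simply created on access
# with a plain dict, counts['apple'] += 1 on a missing key would raise a KeyError instead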

##############################################################################
# string.punctuation gives us the punctuation symbols, which we will also ignore
##############################################################################

from string import punctuation

##############################################################################
# heapq.nlargest is a function that, given a list (or other iterable), easily and quickly
# returns the 'n' largest elements in it. A short example follows the import below.
##############################################################################

from heapq import nlargest
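
# A minimal sketch of nlargest with a key function (the scores dict is made up for illustration):
scores = {'s0': 0.4, 's1': 1.7, 's2': 0.9}
print(nlargest(2, scores, key=scores.get))
# ['s1', 's2'] - the 2 keys with the largest values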


# Our first class, named FrequencySummarizer

##############################################################################

class FrequencySummarizer:
    # indentation changes - we are now inside the class definition

    def __init__(self, min_cut=0.1, max_cut=0.9):
        # The constructor, named __init__.
        # This function will be called each time an object of this class is instantiated.
        # Btw, note how the special keyword 'self' is passed in as the first
        # argument to each method (member function).
        self._min_cut = min_cut
        self._max_cut = max_cut
        # Words that have a frequency lower than min_cut
        # or higher than max_cut will be ignored.
        self._stopwords = set(stopwords.words('english') + list(punctuation))
        # Punctuation symbols and stopwords (common words like 'an', 'the' etc.) are ignored.
        # Here self._min_cut, self._max_cut and self._stopwords are all member variables,
        # i.e. each object (instance) of this class will have an independent version of these
        # variables.
        # Note how this function is used to set up the member variables to their appropriate values.

    # indentation changes - we are out of the constructor (a member function), but we are still
    # inside the class.
    # One important note if you are used to programming in Java or C#: if you define a variable
    # here, i.e. outside a member function but inside the class, it becomes a STATIC member variable.
    # This is an important difference from Java and C# (where all member variables would be defined
    # here) and is a common gotcha to be avoided - see the small illustration below.
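
    # A tiny illustration of the gotcha (a hypothetical class, shown as a comment so it does
    # not become part of FrequencySummarizer):
    #
    #   class Counter:
    #       total = 0              # defined here: a STATIC (class) variable, shared by all instances
    #       def __init__(self):
    #           self.total = 0     # defined here: an instance variable, one per object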

    def _compute_frequencies(self, word_sent):
        # next method (member function), which takes in self (the special keyword for this same
        # object) as well as a list of sentences, and outputs a dictionary where the keys are
        # words and the values are the frequencies of those words in the set of sentences
        freq = defaultdict(int)
        # defaultdict, which we referred to above, is a class that inherits from dict,
        # with one difference: usually, a Python dictionary throws a KeyError if you try
        # to get an item with a key that is not currently in the dictionary.
        # The defaultdict, in contrast, will simply create any items that you try to access
        # (provided of course they do not exist yet). The 'int' passed in as argument tells
        # the defaultdict object to create a default value of 0.
        for s in word_sent:
            # indentation changes - we are inside the for loop, once per sentence
            for word in s:
                # indentation changes again - this is an inner for loop, once per word
                # in that sentence
                if word not in self._stopwords:
                    # if the word is in the member variable (set) self._stopwords, then ignore it,
                    # else increment the frequency. Had freq been a regular dictionary (not a
                    # defaultdict), we would have had to first check whether this word is in the dict
                    freq[word] += 1

        # Done with the frequency calculation - now go through our frequency list and do 2 things:
        # normalize the frequencies by dividing each by the highest frequency (this allows us to
        # always have frequencies between 0 and 1, which makes comparing them easy), and
        # filter out frequencies that are too high or too low. A trick that yields better results.
        m = float(max(freq.values()))
        # get the highest frequency of any word in the list of words
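        # A made-up illustration: if freq were {'kids': 3, 'online': 2, 'strangers': 9} and m were
        # therefore 9.0, the loop below would rescale it to roughly {'kids': 0.33, 'online': 0.22,
        # 'strangers': 1.0} and then delete 'strangers' (1.0 >= max_cut); 'kids' and 'online' survive.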

        for w in list(freq.keys()):
            # indentation changes - we are inside the for loop
            freq[w] = freq[w]/m
            # divide each frequency by that max value, so it is now between 0 and 1
            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
                # indentation changes - we are inside the if statement - if we are here the word is
                # either really common or really uncommon. In either case, delete it from our dictionary
                del freq[w]
                # remember that del can be used to remove a key-value pair from the dictionary
        return freq
        # return the frequency dictionary

    def summarize(self, text, n):
        # next method (member function), which takes in self (the special keyword for this same
        # object) as well as the raw text and the number of sentences we wish the summary to
        # contain. Returns the summary.
        sents = sent_tokenize(text)
        # split the text into sentences
        assert n <= len(sents)
        # assert is a way of making sure a condition holds true, else an exception is thrown.
        # Used to do sanity checks like making sure the summary is shorter than the original article.
        word_sent = [word_tokenize(s.lower()) for s in sents]
        # This 1 line does a lot: it converts each sentence to lower-case, then
        # splits each sentence into words, and collects those word lists (1 per sentence)
        # into one list of lists.
        self._freq = self._compute_frequencies(word_sent)
        # make a call to the method (member function) _compute_frequencies, and place the result
        # in the member variable _freq.
        ranking = defaultdict(int)
        # create an empty dictionary (of the superior defaultdict variety) to hold the rankings
        # of the sentences.
        for i, sent in enumerate(word_sent):
            # Indentation changes - we are inside the for loop. Oh! and this is a different type
            # of for loop: the built-in function enumerate() makes certain loops a bit clearer.
            # enumerate(sequence) will return (0, sequence[0]), (1, sequence[1]), (2, sequence[2]),
            # and so forth.
            # A common idiom to change every element of a list looks like this:
            #     for i in range(len(L)):
            #         item = L[i]
            #         ... compute some result based on item ...
            #         L[i] = result
            # This can be rewritten using enumerate() as:
            #     for i, item in enumerate(L):
            #         ... compute some result based on item ...
            #         L[i] = result
            for w in sent:
                # for each word in this sentence
                if w in self._freq:
                    # if this is not a stopword (common word), add the frequency of that word
                    # to the weightage assigned to that sentence
                    ranking[i] += self._freq[w]

        # OK - we are outside the for loop and now have rankings for all the sentences
        sents_idx = nlargest(n, ranking, key=ranking.get)
        # we want to return the n sentences with the highest ranking; use the nlargest function
        # to do so. This function needs to know how to rank the items, so give it a function -
        # simply the get method of the dictionary
        return [sents[j] for j in sents_idx]
        # return those sentences in a list

# Indentation changes - we are done with our FrequencySummarizer class!
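
# A quick, self-contained sanity check of the class on a made-up paragraph (the text below is
# invented purely for illustration; the real run on a live article follows further down):
sample_article = ("Kids are meeting more strangers online than ever before. "
                  "Online games and chat apps connect kids with strangers every day. "
                  "Parents worry about strangers, but kids mostly chat with school friends. "
                  "Researchers say online strangers are not the biggest risk kids face.")
print(FrequencySummarizer().summarize(sample_article, 2))
# prints a list of the 2 sentences whose (non-stop)words are the most frequent overall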

##############################################################################
# Now to get a URL and summarize
##############################################################################

import urllib.request
from bs4 import BeautifulSoup

##############################################################################
# Introducing Beautiful Soup: "Beautiful Soup is a Python library for pulling data out of
# HTML and XML files. It works with your favorite parser to provide idiomatic ways of
# navigating, searching, and modifying the parse tree. It commonly saves programmers hours
# or days of work." A tiny illustration follows.
##############################################################################
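
# A minimal, made-up illustration of BeautifulSoup in action (the HTML snippet is invented):
sample_html = "<html><head><title>Hello</title></head><body><p>First.</p><p>Second.</p></body></html>"
sample_soup = BeautifulSoup(sample_html, 'html.parser')
print(sample_soup.title.text)                                # Hello
print(' '.join(p.text for p in sample_soup.find_all('p')))  # First. Second.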

def get_only_text_washington_post_url(url):
    # This function takes in a URL as an argument, and returns only the text of the article at
    # that URL.
    page = urllib.request.urlopen(url).read().decode('utf8')
    # download the URL
    soup = BeautifulSoup(page, 'html.parser')
    # initialise a BeautifulSoup object with the text of that URL (naming a parser explicitly
    # avoids bs4's "no parser was explicitly specified" warning)
    text = ' '.join(map(str, soup.find_all('article')))
    # use this code to get everything (markup included) that lies between a pair of
    # <article> and </article> tags. We do this because we know that the URLs we are currently
    # interested in - those from the Washington Post - have this nice property.
    # (Note: we keep the markup here - str(...) rather than .text - so that the <p> tags
    # survive for the second pass below.)
    # OK - we got everything between the <article> and </article> tags, but that everything
    # includes a bunch of other stuff we don't want.
    # Now - repeat, but this time we will only take what lies between <p> and </p> tags -
    # these are HTML tags for "paragraph", i.e. this should give us the actual text of the article
    soup2 = BeautifulSoup(text, 'html.parser')
    text = ' '.join(map(lambda p: p.text, soup2.find_all('p')))
    # use this code to get everything in that text that lies between a pair of
    # <p> and </p> tags
    return soup.title.text, text
    # Return a pair of values (article title, article body)
    # Btw, note that BeautifulSoup returns the title without our doing anything special -
    # this is why BeautifulSoup works so much better than, say, regular expressions at parsing HTML

##############################################################################
# OK! Now let's give this code a spin
##############################################################################

someUrl = "https://www.washingtonpost.com/news/the-switch/wp/2015/08/06/why-kids-are-meeting-more-strangers-online-than-ever-before/"

# the article we would like to summarize

textOfUrl = get_only_text_washington_post_url(someUrl)

# get the title and text

fs = FrequencySummarizer()

# instantiate our FrequencySummarizer class and get an object of this class

summary = fs.summarize(textOfUrl[1], 3)

# get a summary of this article that is 3 sentences long

print(summary)
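
# A slightly friendlier way to display the result - print the title (the first element of the
# pair returned by get_only_text_washington_post_url) followed by each summary sentence:
print(textOfUrl[0])
for sentence in summary:
    print('-', sentence)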
