# You are on page: 1 of 4

# -*- coding: utf-8 -*-

"""
Created on Thu Mar 9 22:26:54 2017
@author: Virat Bhandari
"""
import re
import urllib
import urllib.request

from bs4 import BeautifulSoup
# Download one Bloomberg article and reduce its HTML to plain text.
url = (
    "https://www.bloomberg.com/politics/articles/2017-03-09/"
    "u-k-s-johnson-says-vast-brexit-bill-would-be-not-reasonable"
)
# Close the HTTP response deterministically instead of leaking the socket.
with urllib.request.urlopen(url) as response:
    html = response.read()
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(html, "html.parser")
# kill all script and style elements
for script in soup(["script", "style"]):
    script.extract()  # rip it out
# get text
text = soup.get_text()
# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
# NOTE(review): splitting on a single space puts every word on its own line;
# the separator may originally have been a double space -- confirm.
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)
print(text)
################################### TextBlob
from textblob import TextBlob

# Wrap the scraped text so TextBlob can compute sentiment and POS tags.
opinion = TextBlob(text)
# A bare expression only echoes its value in an interactive console; print
# explicitly so the results are also visible when the file runs as a script.
print(opinion.sentiment)
print(opinion.tags)
###############################################3
##Removing Noise########################
noise_list = ["Bloomberg","to","for","will","be","for","and","of","the","our","i
n","is", "a", "this","that","we",",","\xa0","#","/",".","+","___________________
_______","/n"]
def _remove_noise(input_text):
words = input_text.split()
noise_free_words = [word for word in words if word not in noise_list]
noise_free_text = " ".join(noise_free_words)
return noise_free_text
# Drop the noise tokens from the scraped article text.
updated_text = _remove_noise(text)
##Remove via Regex
def _remove_regex(updated_text, regex_pattern):
urls = re.finditer(regex_pattern, updated_text)
for i in urls:
updated_text = re.sub(i.group().strip(), '', updated_text)
return updated_text
# Raw string: "\w" inside a plain literal is an invalid escape sequence.
regex_pattern = r"#[\w]*"
# _remove_regex returns a new string, so the result has to be kept.
updated_text = _remove_regex(updated_text, regex_pattern)
# Pieces of the text that each begin at an uppercase letter, space-joined;
# anything before the first uppercase letter is not matched.
cleaned = " ".join(re.findall('[A-Z][^A-Z]*', updated_text))
##### Stemming and lemmatization
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# lemmatize() and stem() each work on ONE word, so they are applied token by
# token; handing them the whole article treats it as a single word.
lem = WordNetLemmatizer()
updated_text = " ".join(lem.lemmatize(word) for word in updated_text.split())
stem = PorterStemmer()
updated_text = " ".join(stem.stem(word) for word in updated_text.split())

#from textblob import Word


#w= Word(updated_text)
#w_lem = w.lemmatize()
#w_lem=TextBlob(w_lem)
## Part-of-speech tagging
from nltk import pos_tag, word_tokenize

# Tokenize the processed text, then print each token with its POS tag.
tokens = word_tokenize(updated_text)
print(pos_tag(tokens))
#####
import gensim
import gensim.corpora as cp

# The whole article is one document: a single list of whitespace tokens.
tokenized_docs = [updated_text.split()]
# Creating the term dictionary of our corpus, where every unique term is
# assigned an index.
dictionary = cp.Dictionary(tokenized_docs)
# Converting list of documents (corpus) into Document Term Matrix using
# dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in tokenized_docs]
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel
# Running and Training LDA model on the document term matrix
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word=dictionary, passes=50)
# Results
print(ldamodel.print_topics())
# Creating N-grams (the script below uses N = 2)
def generate_ngrams(text, n):
    """Return every run of *n* consecutive words in *text*.

    Args:
        text: Input string; words are separated by whitespace.
        n: Number of words per n-gram; must be at least 1.

    Returns:
        A list of lists, one inner list of *n* words per n-gram, in order of
        appearance.  Empty when *text* has fewer than *n* words.

    Raises:
        ValueError: If *n* is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    words = text.split()
    return [words[i:i + n] for i in range(len(words) - n + 1)]
# Show the bigrams; a bare call discards its result outside a REPL.
print(generate_ngrams(updated_text, 2))
# Statistics
# TF - IDF
from sklearn.feature_extraction.text import TfidfVectorizer

obj = TfidfVectorizer()
# NOTE(review): every whitespace token is passed as its own "document", so
# the matrix has one row per word -- confirm this is the intended corpus.
corpus = updated_text.split()
X = obj.fit_transform(corpus)
print(X)
# TEXT STATISTICS
from textstat.textstat import textstat

if __name__ == '__main__':
    # Print one textstat readability metric per line for the processed text.
    test_data = updated_text
    print(textstat.flesch_reading_ease(test_data))
    print(textstat.smog_index(test_data))
    print(textstat.flesch_kincaid_grade(test_data))
    print(textstat.coleman_liau_index(test_data))
    print(textstat.automated_readability_index(test_data))
    print(textstat.dale_chall_readability_score(test_data))
    print(textstat.difficult_words(test_data))
    print(textstat.linsear_write_formula(test_data))
    print(textstat.gunning_fog(test_data))
    print(textstat.text_standard(test_data))
# Word embedding
from gensim.models import Word2Vec

# sentences = [['data', 'science'], ['vidhya', 'science', 'data', 'analytics'],
#              ['machine', 'learning'], ['deep', 'learning']]
# train the model on your corpus
# Word2Vec expects an iterable of token lists (one list per sentence).  A flat
# list of words would be iterated character by character, so the article is
# wrapped as a single tokenized sentence.
sent = [updated_text.split()]
model = Word2Vec(sent, min_count=10)
# NOTE(review): this raises KeyError unless both words occur at least
# min_count times in the processed text -- confirm they survive the
# earlier cleaning steps.
print(model.wv.similarity('Brexit', 'EU'))

## Machine Learning
## Here is a code that uses naive bayes classifier using text blob library
## (built on top of nltk).
from textblob.classifiers import NaiveBayesClassifier as NBC
from textblob import TextBlob

# (sentence, label) pairs used to train the classifier.
training_corpus = [
    ('Brexit EU London Borris.', 'Politics'),
    ("Theresa May is doing bad politics", 'Politics'),
    ('He is my badest enemy!', 'Children'),
    ('My management is poor.', 'Managers'),
    ('I love this burger.', 'Children'),
    ('This is an brilliant place!', 'Tourism'),
    ('I feel very good about these dates.', 'Tourism'),
    ('This is my best visit to any country.', 'Politics'),
    ("What an awesome view", 'Tourism'),
    ('I do not like this dish', 'Children'),
]
# Labelled held-out examples; not used by the statements in this section.
test_corpus = [
    ("Brexit is not going to be good", 'Politics'),
    ("I feel brilliant!", 'Tourism'),
    ('Gary is a friend of mine.', 'Children'),
    ("I can't believe I'm doing this, Borris.", 'Politics'),
    ('The date was good.', 'Tourism'),
    ('I do not enjoy London', 'Politics'),
]
# NOTE: this rebinds the module-level name ``model``.
model = NBC(training_corpus)
print(model.classify("Brexit Borris"))
print(model.classify("badest management poor burger"))
print(model.classify("Brilliant view feel very good"))

## Text Matching / Similarity


def levenshtein(s1, s2):
    """Return the Levenshtein (edit) distance between *s1* and *s2*.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    Only one row of the dynamic-programming table is kept at a time.
    """
    # Keep s1 as the shorter string so each row is as short as possible.
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    # Row for the empty prefix of s2: turning "" into s1[:k] costs k edits.
    distances = range(len(s1) + 1)
    for index2, char2 in enumerate(s2):
        new_distances = [index2 + 1]
        for index1, char1 in enumerate(s1):
            if char1 == char2:
                # Characters match: carry the diagonal cost over unchanged.
                new_distances.append(distances[index1])
            else:
                # One edit on top of the cheapest of substitution (diagonal),
                # and the two insert/delete neighbours (above, left).
                new_distances.append(
                    1 + min(
                        distances[index1],
                        distances[index1 + 1],
                        new_distances[-1],
                    )
                )
        distances = new_distances
    return distances[-1]


print(levenshtein("analyze", "analyse"))

## Cosin Similarity
## By this we can check if any 2 articles are same or not
import math
from collections import Counter
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse count vectors.

    Args:
        vec1: Mapping from term to numeric weight (e.g. a ``Counter``).
        vec2: Mapping from term to numeric weight.

    Returns:
        The dot product over the shared keys divided by the product of the
        two Euclidean norms, or ``0.0`` when either vector has zero norm.
    """
    common = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[key] * vec2[key] for key in common)
    sum1 = sum(value ** 2 for value in vec1.values())
    sum2 = sum(value ** 2 for value in vec2.values())
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    # An empty or all-zero vector has no direction; avoid dividing by zero.
    if not denominator:
        return 0.0
    return float(numerator) / denominator
def text_to_vector(text):
    """Return a ``Counter`` of the whitespace-separated words in *text*."""
    return Counter(text.split())
# Compare the processed article with a fixed reference sentence using the
# cosine similarity of their word-count vectors.
text1, text2 = (
    updated_text,
    'article on analytics vidhya is about natural language processing',
)
vector1, vector2 = text_to_vector(text1), text_to_vector(text2)
cosine = get_cosine(vector1, vector2)