Future Work
One future task would be to apply NER to the news articles in order to extract the entities (people and companies) mentioned in the texts. This would make it possible to distinguish articles that have high similarity scores but refer to different people or companies.
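A minimal sketch of what that step could look like, assuming spaCy's Portuguese model (pt_core_news_sm) as the NER component; both the model choice and the entity-overlap heuristic are illustrative assumptions, not part of this work:

import spacy

# Hypothetical NER component: spaCy's small Portuguese model.
nlp = spacy.load('pt_core_news_sm')

def extract_entities(text):
    # Collect the people (PER) and organizations (ORG) mentioned in a text.
    return {ent.text.lower() for ent in nlp(text).ents if ent.label_ in ('PER', 'ORG')}

def same_subject(text1, text2):
    # Treat two highly similar articles as covering the same story only if
    # they mention at least one person or company in common.
    return bool(extract_entities(text1) & extract_entities(text2))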
Appendix A - News Crawler
# Hooks that each concrete spider must implement
@abstractmethod
def parse(self, response):
    pass

@abstractmethod
def parse_details(self, response):
    pass
for t in element.xpath(
        './/p//text() | .//blockquote//text() | .//h1//text() | .//h2//text() | '
        './/h3//text() | .//h4//text() | .//h5//text() | .//h6//text() | '
        './/div//text()').extract():
    if t:
        text = text + t + ' '
formated_text = self.removeWhiteSpaces(text)
if noticia['autor'] is not None:
    noticia['autor'] = self.removeWhiteSpaces(noticia['autor'])
if noticia['descricao'] is not None:
    noticia['descricao'] = self.removeWhiteSpaces(noticia['descricao'])
if noticia['texto'] is not None:
    noticia['texto'] = self.removeWhiteSpaces(noticia['texto'])
else:
    # Fall back to the text extracted from the page body
    noticia['texto'] = formated_text
if noticia['titulo'] is not None:
    noticia['titulo'] = self.removeWhiteSpaces(noticia['titulo'])
else:
    # Fall back to the title extracted from the page body
    noticia['titulo'] = formated_title
noticia['url'] = page_url
# Regex-based topic flags (commented out, like the matching Item fields below)
#p1 = reg()
#noticia['reg_petro'] = p1.match_texto_petrobras(formated_text) != None
#noticia['reg_empresas_lavajato'] = p1.match_texto_empresas_lavajato(formated_text) != None
#noticia['reg_corrupcao'] = p1.match_texto_corrupcao(formated_text) != None
#noticia['reg_dinheiro'] = p1.match_texto_reg_dinheiro(formated_text) != None
self.crawled_count += 1
self.logger.info('News Count %s : %s' % (self.crawled_count, page_url))

pdf = Pdf(veiculo=self.name,
          url_mae=page_url,
          url=pdf_url)
self.pdf_count += 1
self.logger.info('PDF Count %s : %s' % (self.pdf_count, pdf_url))
yield scrapy.Request(pdf_url, callback=self.save_pdf, meta={'pdf': pdf})
else:
    if (self.domain in page_url) and (not self.urlExists(page_url)):
        yield scrapy.Request(page_url, callback=self.parse_noticia)

if not self.urlExists(page_url):
    yield noticia
if response.url.endswith('.pdf'):
    file = self.project_path + 'pdf/' + self.name + '/' + \
        re.sub('[^\\w|\\s]', '_', response.url).replace('_pdf', '.pdf')
else:
    file = self.project_path + 'pdf/' + self.name + '/' + \
        re.sub('[^\\w|\\s]', '_', response.url) + '.pdf'
try:
    if not path.isfile(file):
        with open(file, 'wb') as f:
            f.write(response.body)
except Exception as ex:
    self.logger.error(ex)
self.df_crawled = self.df_crawled.append({'url': response.url}, ignore_index=True)
if not self.urlExists(response.url):
    yield pdf
import scrapy

class Noticia(scrapy.Item):
    data = scrapy.Field()
    veiculo = scrapy.Field()
    descricao = scrapy.Field()
    texto = scrapy.Field()
    autor = scrapy.Field()
    titulo = scrapy.Field()
    url = scrapy.Field()
    #reg_petro = scrapy.Field()
    #reg_empresas_lavajato = scrapy.Field()
    #reg_corrupcao = scrapy.Field()
    #reg_dinheiro = scrapy.Field()

class Pdf(scrapy.Item):
    veiculo = scrapy.Field()
    url_mae = scrapy.Field()
    url = scrapy.Field()
import json

import scrapy

# ApdSpider is the project's abstract base spider (its parse/parse_details
# hooks are shown in the excerpt above).
class G1Spider(ApdSpider):
    name = 'g1'
    sites = [{'nome': 'Blog da Julia Duailibi',
              'base_url': 'https://falkor-cda.bastian.globo.com/tenants/g1/instances/7f8f0359-e9d7-42e2-add0-b41d82b138f8/posts/page/%s',
              'count': 1,
              'crawled_count': 0,
              'done': False},
             {'nome': 'Blog da Cristiana Lobo',
              'base_url': 'https://falkor-cda.bastian.globo.com/tenants/g1/instances/21a52bf6-286b-4094-9384-5beffa8806e6/posts/page/%s',
              'count': 1,
              'crawled_count': 0,
              'done': False},
             {'nome': 'Blog do Camarotti',
              'base_url': 'https://falkor-cda.bastian.globo.com/tenants/g1/instances/edb56541-7a67-4e0e-85b9-bfa305d3d11a/posts/page/%s',
              'count': 1,
              'crawled_count': 0,
              'done': False},
             {'nome': 'Blog do Matheus Leitao',
              'base_url': 'https://falkor-cda.bastian.globo.com/tenants/g1/instances/a824ae77-9930-44ca-b665-37f118648436/posts/page/%s',
              'count': 1,
              'crawled_count': 0,
              'done': False},
             {'nome': 'Operacao Lava-Jato',
              'base_url': 'https://falkor-cda.bastian.globo.com/tenants/g1/instances/f237164d-7855-4714-b2d8-bf535da06bf3/posts/page/%s',
              'count': 1,
              'crawled_count': 0,
              'done': False},
             {'nome': 'Blog do Valdo Cruz',
              'base_url': 'https://falkor-cda.bastian.globo.com/tenants/g1/instances/6cdf9cc8-73e8-4d4b-95f8-2f5bf3688f9a/posts/page/%s',
              'count': 1,
              'crawled_count': 0,
              'done': False},
             {'nome': 'Blog da Andreia Sadi',
              'base_url': 'https://falkor-cda.bastian.globo.com/tenants/g1/instances/384d9047-117e-4d94-b225-1849e4b6201f/posts/page/%s',
              'count': 1,
              'crawled_count': 0,
              'done': False},
             {'nome': 'G1 Politica',
              'base_url': 'https://falkor-cda.bastian.globo.com/tenants/g1/instances/1b9deafa-9519-48a2-af13-5db036018bad/posts/page/%s',
              'count': 1,
              'crawled_count': 0,
              'done': False}]
    domain = 'g1.globo.com/politica/'
    '''
    custom_settings = {
        'LOG_FILE': crawler.settings.PROJECT_ROOT + "/log/" + name + "/" + name + "_log.txt"
    }
    '''

    def start_requests(self):
        for site in self.sites:
            yield scrapy.Request(site['base_url'] % site['count'], meta={'site': site},
                                 callback=self.parse)
    def parse(self, response):
        """Fields read from each listing item:
        1. descricao
        2. data
        3. autor
        """
        # Read the JSON from the response body
        jsonResponse = json.loads(response.body_as_unicode())
        items = jsonResponse['items']
        site = response.meta.get('site')
        if len(items) > 0:
            for item in items:
                url = item['content']['url']
                if (url is not None) and (self.domain in url):
                    if not self.urlExists(url):
                        yield scrapy.Request(url, callback=self.parse_details, meta={'item': item})
                    else:
                        site['crawled_count'] += 1
                        site['done'] = site['crawled_count'] > self.NUM_CRAWLED
        else:
            site['done'] = True
        site['count'] += 1
        if not self.allDone() and (not site['done']):
            yield scrapy.Request(site['base_url'] % site['count'], meta={'site': site},
                                 callback=self.parse)
    def allDone(self):
        for site in self.sites:
            if not site['done']:
                return False
        return True
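For completeness, one way to launch a spider like this is programmatically through Scrapy's CrawlerProcess; this is only a sketch (the project's actual entry point and settings are not shown in the appendix, and the usual route is the `scrapy crawl g1` CLI command):

from scrapy.crawler import CrawlerProcess

# Minimal sketch of a programmatic run; the settings dict is an assumption.
process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(G1Spider)
process.start()  # blocks until the crawl finishes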
Appendix B - Embeddings and Similarity Code
import pandas as pd
import gensim
import numpy as np
import re
from dateutil import parser
from datetime import datetime, timedelta
from allennlp.commands.elmo import ElmoEmbedder
from sklearn.metrics.pairwise import cosine_similarity
# Load the data
df_g1 = pd.read_csv('data/g1_match.csv', sep='|', index_col=0)
df_g1_sim = pd.DataFrame(columns=['index1', 'index2', 'url1', 'url2', 'data1', 'data2',
                                  'fasttext', 'elmo'])
# Returns the FastText embedding of a text as the mean of its word embeddings
def calculate_fasttext_mean_vector(model, words):
    words = [word for word in words if word in model.vocab]
    if len(words) >= 1:
        return np.mean(model[words], axis=0)
    else:
        return []
# Returns the ELMo embedding of a text as the mean of its token embeddings
def calculate_elmo_mean_vector(model, document):
    if len(document) >= 1:
        embeddings = model.embed_sentence(document)
        # Layer 2 holds the top-level contextual representations
        return np.mean(embeddings[2], axis=0)
    else:
        # Mirror the FastText helper for empty documents
        return []
# Tokenized texts of the articles published on or after the cutoff date
limit_date = parser.parse('01/01/2019')
documents = []
for index1, row1 in df_g1.iterrows():
    date = row1['data']
    if date:
        date = parser.parse(date)
        if date >= limit_date:
            documents.append(str(row1['texto']).split())
# Articles matching both the company-group and the fraud regex flags
df_g1_petro = df_g1[df_g1['match_empresas_grupo'] == True]
df_g1_petro = df_g1_petro[df_g1_petro['match_fraude'] == True]
# Articles matching both the Lava Jato companies and the fraud regex flags
df_g1_lavajato = df_g1[df_g1['match_empresas_lavajato'] == True]
df_g1_lavajato = df_g1_lavajato[df_g1_lavajato['match_fraude'] == True]
df_filtrado['data'] = pd.to_datetime(df_filtrado['data'])
mask = (df_filtrado['data'] >= limit_date)
df_filtrado = df_filtrado.loc[mask]
df_filtrado.reset_index(inplace=True)
cnt = 1
total = len(df_filtrado['texto'])

# Compute the pairwise similarities
print('Computing the similarities')
i = 1
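The matrices fasttext_similarity and elmo_similarity indexed in the loop below are built from the mean-vector helpers above, but that step is missing from the excerpt. A minimal sketch, assuming the fastText cc.pt.300.vec word vectors and AllenNLP's default ELMo weights (both assumptions), and that every document yields a non-empty mean vector:

# Sketch: pairwise cosine-similarity matrices over the document mean vectors.
# Model files are assumptions; the thesis's actual checkpoints may differ.
fasttext_model = gensim.models.KeyedVectors.load_word2vec_format('cc.pt.300.vec')
elmo_model = ElmoEmbedder()  # default weights unless others are supplied

fasttext_vectors = np.array([calculate_fasttext_mean_vector(fasttext_model, doc)
                             for doc in documents])
elmo_vectors = np.array([calculate_elmo_mean_vector(elmo_model, doc)
                         for doc in documents])
fasttext_similarity = cosine_similarity(fasttext_vectors)
elmo_similarity = cosine_similarity(elmo_vectors)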
for index1, row1 in df_filtrado.iterrows():
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    print(str(i))
    for index2, row2 in df_filtrado.iterrows():
        url1 = row1['url']
        url2 = row2['url']
        data1 = row1['data']
        data2 = row2['data']
        # Only compare distinct articles published within one day of each other
        data_ant = row1['data'] - timedelta(days=1)
        data_post = row1['data'] + timedelta(days=1)
        if (url1 != url2) and (row2['data'] >= data_ant) and (row2['data'] <= data_post):
            doc1 = row1['texto']
            doc2 = row2['texto']
            if doc1 and doc2:
                fasttext = fasttext_similarity[index1][index2]
                elmo = elmo_similarity[index1][index2]
                df_g1_sim = df_g1_sim.append({'index1': index1, 'index2': index2,
                                              'url1': url1, 'url2': url2,
                                              'data1': data1, 'data2': data2,
                                              'fasttext': fasttext, 'elmo': elmo},
                                             ignore_index=True)
    i += 1
# Save the results
print('Saving the results')
df_filtrado.to_csv('result/g1_filtered.csv')
df_g1_sim.to_csv('result/g1_result.csv')