Step 1: Create a CSV file
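The tutorial does not show this step explicitly. A minimal sketch, assuming you just need a small file of text to mine (the file name and contents below are illustrative, not part of the original):

# Write a few lines of sample text to "sample_text.csv" (illustrative only)
sample_text <- data.frame(text = c("R makes text mining straightforward",
                                   "Word clouds visualise word frequencies",
                                   "The tm package handles corpus cleaning"))
write.csv(sample_text, "sample_text.csv", row.names = FALSE)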

Step 2: Install and load the required packages

# for text mining
install.packages("tm")
# for text stemming
install.packages("SnowballC")
# for the word-cloud generator
install.packages("wordcloud")
# for colour palettes
install.packages("RColorBrewer")

# Load
library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")

Step 3: Text mining

1. Load the text

text <- readLines(file.choose())

2. Load the data as a corpus

docs <- Corpus(VectorSource(text))

3. Inspect the content of the document

inspect(docs)

Text transformation

# Replace "/", "@" and "|" with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")
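content_transformer() wraps an ordinary function such as gsub() so that tm_map() can apply it to every document in the corpus. Replacing "/", "@" and "|" with spaces separates words that were glued to those characters before the cleaning steps below run.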

Cleaning the text

# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Remove common English stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove your own stopwords, specified as a character vector
docs <- tm_map(docs, removeWords, c("blabla1", "blabla2"))
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
# Eliminate extra whitespace
docs <- tm_map(docs, stripWhitespace)
# Text stemming (reduce words to their root form)
docs <- tm_map(docs, stemDocument)
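A quick sanity check (my addition, not part of the original tutorial) to confirm the transformations took effect:

# Print the first cleaned document
writeLines(as.character(docs[[1]]))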

Step 4: Build a term-document matrix

dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)    # term frequencies, most frequent first
d <- data.frame(word = names(v), freq = v)
head(d, 10)

Step 5: Generate the word cloud

set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
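A few notes on the arguments: min.freq drops words that occur fewer times than the threshold, max.words caps how many words are drawn, random.order = FALSE places the most frequent words in the centre, rot.per sets the fraction of words rotated 90 degrees, and brewer.pal(8, "Dark2") supplies eight colours from the RColorBrewer Dark2 palette. set.seed(1234) makes the layout reproducible.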

Explore frequent terms and their associations

findFreqTerms(dtm, lowfreq = 4)

findAssocs(dtm, terms = "app", corlimit = 0.3)
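findFreqTerms() lists the terms that occur at least lowfreq times in the matrix, while findAssocs() returns the terms whose correlation with "app" across documents is at least corlimit (here 0.3).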


Plot word frequencies

barplot(d[1:10,]$freq, las = 2, names.arg = d[1:10,]$word,
        col = "lightblue", main = "Most frequent words",
        ylab = "Word frequencies")

Read the text file from the internet


filePath <- "http://www.sthda.com/sthda/RDoc/example-files/martin-luther-king-i-have-a-
dream-speech.txt"
text <- readLines(filePath)
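readLines() may warn about an incomplete final line; the warning is harmless and the text is read in full.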
How to fetch data from Twitter?

Go to https://developer.twitter.com/en/apps and create an app to obtain your API keys.

# Load twitteR, which provides setup_twitter_oauth() and searchTwitter()
library("twitteR")

# Replace the placeholders below with the keys from your own Twitter app
# (never publish real credentials)
consumerKey <- "YOUR_CONSUMER_KEY"
consumerSecret <- "YOUR_CONSUMER_SECRET"
accessToken <- "YOUR_ACCESS_TOKEN"
accessTokenSecret <- "YOUR_ACCESS_TOKEN_SECRET"
setup_twitter_oauth(consumerKey, consumerSecret, accessToken, accessTokenSecret)
tweets <- searchTwitter("dearicaipleasechange", n = 1000, lang = "en")
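searchTwitter() returns a list of status objects matching the query; n = 1000 asks for up to 1,000 tweets and lang = "en" restricts the results to English. The standard search API only covers recent tweets, so fewer than n may come back.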

How to save the tweets?


tweet.df <- twListToDF(tweets)      # convert the list of status objects to a data frame
write.csv(tweet.df, "tweets1.csv")  # save to the working directory
getwd()                             # print that directory so you can find the file
Sentiment Analysis

install.packages("SentimentAnalysis")
library(SentimentAnalysis)

# Analyze a single string to obtain a binary response (positive / negative)
sentiment <- analyzeSentiment("Yeah, this was a great soccer game for the German team!")
convertToBinaryResponse(sentiment)$SentimentQDAP
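analyzeSentiment() returns a data frame with one score column per built-in dictionary (GI, HE, LM and QDAP); convertToBinaryResponse() turns each numeric score into a positive/negative factor, and $SentimentQDAP selects the result based on the QDAP dictionary.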

# Create a vector of strings
documents <- c("Wow, I really like the new light sabers!",
               "That book was excellent.",
               "R is a fantastic language.",
               "The service in this restaurant was miserable.",
               "This is neither positive or negative.",
               "The waiter forget about my dessert -- what poor service!")

sentiment <- analyzeSentiment(documents)
sentiment$SentimentQDAP

# View sentiment direction (i.e. positive, neutral and negative)

convertToDirection(sentiment$SentimentQDAP)
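Unlike convertToBinaryResponse(), convertToDirection() maps each score to one of three levels, with scores of zero treated as neutral.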
The same workflow scales to a whole corpus. The tm package ships with the crude dataset, a small collection of Reuters news articles about crude oil:

library(tm)
data("crude")

# Analyze sentiment
sentiment <- analyzeSentiment(crude)

# Count positive and negative news releases
table(convertToBinaryResponse(sentiment$SentimentLM))
# News releases with the highest and lowest sentiment
crude[[which.max(sentiment$SentimentLM)]]$meta$heading
crude[[which.min(sentiment$SentimentLM)]]$meta$heading
# Visualize distribution of standardized sentiment variable
hist(scale(sentiment$SentimentLM))
# Compute cross-correlation
cor(sentiment[, c("SentimentLM", "SentimentHE", "SentimentQDAP")])
# Crude oil news between 1987-02-26 and 1987-03-02
datetime <- do.call(c, lapply(crude, function(x) x$meta$datetimestamp))
plotSentiment(sentiment$SentimentLM)
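plotSentiment() draws the scores in document order. The datetime vector built above is not passed to it; a minimal base-R sketch (my addition) that plots the LM sentiment against those timestamps instead:

# Plot LM sentiment against the article timestamps (illustrative only)
plot(datetime, sentiment$SentimentLM, type = "b",
     xlab = "Date", ylab = "LM sentiment")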
Twitter Sentiment Analysis

# Install required packages
install.packages("SnowballC")
install.packages("tm")
install.packages("twitteR")
install.packages("syuzhet")

# Load them
library("SnowballC")
library("tm")
library("twitteR")
library("syuzhet")

tweets.df <- twListToDF(tweets)
head(tweets.df)
head(tweets.df$text)
# Remove URLs, hashtags and Twitter handles
tweets.df2 <- gsub("http.*", "", tweets.df$text)
tweets.df2 <- gsub("https.*", "", tweets.df2)
tweets.df2 <- gsub("#.*", "", tweets.df2)
tweets.df2 <- gsub("@.*", "", tweets.df2)

We will first get an emotion score for each tweet. The get_nrc_sentiment() function from 'syuzhet' uses the NRC lexicon to score each tweet on eight emotions (anger, anticipation, disgust, fear, joy, sadness, surprise and trust) plus two sentiments (negative and positive), ten columns in total.
word.df <- as.vector(tweets.df2)
emotion.df <- get_nrc_sentiment(word.df)
emotion.df2 <- cbind(tweets.df2, emotion.df)
head(emotion.df2)
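To see which emotions dominate across all tweets, a short sketch (my addition) that totals each NRC column and plots the counts:

# Sum each emotion/sentiment column across tweets and plot the totals
emotion.totals <- colSums(emotion.df)
barplot(sort(emotion.totals, decreasing = TRUE), las = 2,
        col = "lightblue", main = "NRC emotion counts")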
Most positive and most negative tweets

sent.value <- get_sentiment(word.df)

most.positive <- word.df[sent.value == max(sent.value)]
most.positive

most.negative <- word.df[sent.value == min(sent.value)]
most.negative

Sentiment score for each tweet

sent.value
Segregating positive, negative and neutral tweets

positive.tweets <- word.df[sent.value > 0]
head(positive.tweets)

negative.tweets <- word.df[sent.value < 0]
head(negative.tweets)

neutral.tweets <- word.df[sent.value == 0]
head(neutral.tweets)
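A compact way (my addition) to count how many tweets fall into each bucket:

# Label each tweet by the sign of its score and tabulate
category <- ifelse(sent.value > 0, "positive",
                   ifelse(sent.value < 0, "negative", "neutral"))
table(category)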
