Overview

The goal of clevercloudr is to serve as a solution for creating meaningful word clouds. This library helps data scientists and data analysts clean their data easily by providing functions to clean raw text data, perform stemming, and customize stopwords.

Load the library

library(clevercloudr)

Data

The first function, CleverClean(), takes a list of strings as input.

text <- list("grounds!!!", "feet6", "running123", "feeding", "feed", "feed$", "grounding", "feet", "happiness")
text
#> [[1]]
#> [1] "grounds!!!"
#> 
#> [[2]]
#> [1] "feet6"
#> 
#> [[3]]
#> [1] "running123"
#> 
#> [[4]]
#> [1] "feeding"
#> 
#> [[5]]
#> [1] "feed"
#> 
#> [[6]]
#> [1] "feed$"
#> 
#> [[7]]
#> [1] "grounding"
#> 
#> [[8]]
#> [1] "feet"
#> 
#> [[9]]
#> [1] "happiness"

Clean text data with CleverClean()

CleverClean() takes in a list of strings as an input. It removes digits and punctuation from the strings and returns a character vector.

clean_text <- CleverClean(text)
clean_text
#> [1] "grounds"   "feet"      "running"   "feeding"   "feed"      "feed"     
#> [7] "grounding" "feet"      "happiness"

Perform stemming with CleverStemmer()

CleverStemmer() takes in a character vector or a string as an input. It performs stemming on each element of the character vector or each word in the string.

stem_text <- CleverStemmer(clean_text)
stem_text
#> [1] "ground" "feet"   "run"    "feed"   "feed"   "feed"   "ground" "feet"  
#> [9] "happi"

Add customized English stopwords with CleverStopwords()

CleverStopwords() takes a list of strings as an input. It adds each string in the input list to a list of the most common English stopwords and returns the combined list.

new_words <- list("happi")
new_stopwords <- CleverStopwords(new_words)
new_stopwords
#> [[1]]
#> [1] "a"
#> 
#> [[2]]
#> [1] "about"
#> 
#> [[3]]
#> [1] "above"
#> 
#> [[4]]
#> [1] "after"
#> 
#> [[5]]
#> [1] "again"
#> 
#> [[6]]
#> [1] "against"
#> 
#> [[7]]
#> [1] "all"
#> 
#> [[8]]
#> [1] "am"
#> 
#> [[9]]
#> [1] "an"
#> 
#> [[10]]
#> [1] "and"
#> 
#> [[11]]
#> [1] "any"
#> 
#> [[12]]
#> [1] "are"
#> 
#> [[13]]
#> [1] "aren't"
#> 
#> [[14]]
#> [1] "as"
#> 
#> [[15]]
#> [1] "at"
#> 
#> [[16]]
#> [1] "be"
#> 
#> [[17]]
#> [1] "because"
#> 
#> [[18]]
#> [1] "been"
#> 
#> [[19]]
#> [1] "before"
#> 
#> [[20]]
#> [1] "being"
#> 
#> [[21]]
#> [1] "below"
#> 
#> [[22]]
#> [1] "between"
#> 
#> [[23]]
#> [1] "both"
#> 
#> [[24]]
#> [1] "but"
#> 
#> [[25]]
#> [1] "by"
#> 
#> [[26]]
#> [1] "can't"
#> 
#> [[27]]
#> [1] "cannot"
#> 
#> [[28]]
#> [1] "could"
#> 
#> [[29]]
#> [1] "couldn't"
#> 
#> [[30]]
#> [1] "did"
#> 
#> [[31]]
#> [1] "didn't"
#> 
#> [[32]]
#> [1] "do"
#> 
#> [[33]]
#> [1] "does"
#> 
#> [[34]]
#> [1] "doesn't"
#> 
#> [[35]]
#> [1] "doing"
#> 
#> [[36]]
#> [1] "don't"
#> 
#> [[37]]
#> [1] "down"
#> 
#> [[38]]
#> [1] "during"
#> 
#> [[39]]
#> [1] "each"
#> 
#> [[40]]
#> [1] "few"
#> 
#> [[41]]
#> [1] "for"
#> 
#> [[42]]
#> [1] "from"
#> 
#> [[43]]
#> [1] "further"
#> 
#> [[44]]
#> [1] "had"
#> 
#> [[45]]
#> [1] "hadn't"
#> 
#> [[46]]
#> [1] "happi"
#> 
#> [[47]]
#> [1] "has"
#> 
#> [[48]]
#> [1] "hasn't"
#> 
#> [[49]]
#> [1] "have"
#> 
#> [[50]]
#> [1] "haven't"
#> 
#> [[51]]
#> [1] "having"
#> 
#> [[52]]
#> [1] "he"
#> 
#> [[53]]
#> [1] "he'd"
#> 
#> [[54]]
#> [1] "he'll"
#> 
#> [[55]]
#> [1] "he's"
#> 
#> [[56]]
#> [1] "her"
#> 
#> [[57]]
#> [1] "here"
#> 
#> [[58]]
#> [1] "here's"
#> 
#> [[59]]
#> [1] "hers"
#> 
#> [[60]]
#> [1] "herself"
#> 
#> [[61]]
#> [1] "him"
#> 
#> [[62]]
#> [1] "himself"
#> 
#> [[63]]
#> [1] "his"
#> 
#> [[64]]
#> [1] "how"
#> 
#> [[65]]
#> [1] "how's"
#> 
#> [[66]]
#> [1] "i"
#> 
#> [[67]]
#> [1] "i'd"
#> 
#> [[68]]
#> [1] "i'll"
#> 
#> [[69]]
#> [1] "i'm"
#> 
#> [[70]]
#> [1] "i've"
#> 
#> [[71]]
#> [1] "if"
#> 
#> [[72]]
#> [1] "in"
#> 
#> [[73]]
#> [1] "into"
#> 
#> [[74]]
#> [1] "is"
#> 
#> [[75]]
#> [1] "isn't"
#> 
#> [[76]]
#> [1] "it"
#> 
#> [[77]]
#> [1] "it's"
#> 
#> [[78]]
#> [1] "its"
#> 
#> [[79]]
#> [1] "itself"
#> 
#> [[80]]
#> [1] "let's"
#> 
#> [[81]]
#> [1] "me"
#> 
#> [[82]]
#> [1] "more"
#> 
#> [[83]]
#> [1] "most"
#> 
#> [[84]]
#> [1] "mustn't"
#> 
#> [[85]]
#> [1] "my"
#> 
#> [[86]]
#> [1] "myself"
#> 
#> [[87]]
#> [1] "no"
#> 
#> [[88]]
#> [1] "nor"
#> 
#> [[89]]
#> [1] "not"
#> 
#> [[90]]
#> [1] "of"
#> 
#> [[91]]
#> [1] "off"
#> 
#> [[92]]
#> [1] "on"
#> 
#> [[93]]
#> [1] "once"
#> 
#> [[94]]
#> [1] "only"
#> 
#> [[95]]
#> [1] "or"
#> 
#> [[96]]
#> [1] "other"
#> 
#> [[97]]
#> [1] "ought"
#> 
#> [[98]]
#> [1] "our"
#> 
#> [[99]]
#> [1] "ours"
#> 
#> [[100]]
#> [1] "ourselves"
#> 
#> [[101]]
#> [1] "out"
#> 
#> [[102]]
#> [1] "over"
#> 
#> [[103]]
#> [1] "own"
#> 
#> [[104]]
#> [1] "same"
#> 
#> [[105]]
#> [1] "shan't"
#> 
#> [[106]]
#> [1] "she"
#> 
#> [[107]]
#> [1] "she'd"
#> 
#> [[108]]
#> [1] "she'll"
#> 
#> [[109]]
#> [1] "she's"
#> 
#> [[110]]
#> [1] "should"
#> 
#> [[111]]
#> [1] "shouldn't"
#> 
#> [[112]]
#> [1] "so"
#> 
#> [[113]]
#> [1] "some"
#> 
#> [[114]]
#> [1] "such"
#> 
#> [[115]]
#> [1] "than"
#> 
#> [[116]]
#> [1] "that"
#> 
#> [[117]]
#> [1] "that's"
#> 
#> [[118]]
#> [1] "the"
#> 
#> [[119]]
#> [1] "their"
#> 
#> [[120]]
#> [1] "theirs"
#> 
#> [[121]]
#> [1] "them"
#> 
#> [[122]]
#> [1] "themselves"
#> 
#> [[123]]
#> [1] "then"
#> 
#> [[124]]
#> [1] "there"
#> 
#> [[125]]
#> [1] "there's"
#> 
#> [[126]]
#> [1] "these"
#> 
#> [[127]]
#> [1] "they"
#> 
#> [[128]]
#> [1] "they'd"
#> 
#> [[129]]
#> [1] "they'll"
#> 
#> [[130]]
#> [1] "they're"
#> 
#> [[131]]
#> [1] "they've"
#> 
#> [[132]]
#> [1] "this"
#> 
#> [[133]]
#> [1] "those"
#> 
#> [[134]]
#> [1] "through"
#> 
#> [[135]]
#> [1] "to"
#> 
#> [[136]]
#> [1] "too"
#> 
#> [[137]]
#> [1] "under"
#> 
#> [[138]]
#> [1] "until"
#> 
#> [[139]]
#> [1] "up"
#> 
#> [[140]]
#> [1] "very"
#> 
#> [[141]]
#> [1] "was"
#> 
#> [[142]]
#> [1] "wasn't"
#> 
#> [[143]]
#> [1] "we"
#> 
#> [[144]]
#> [1] "we'd"
#> 
#> [[145]]
#> [1] "we'll"
#> 
#> [[146]]
#> [1] "we're"
#> 
#> [[147]]
#> [1] "we've"
#> 
#> [[148]]
#> [1] "were"
#> 
#> [[149]]
#> [1] "weren't"
#> 
#> [[150]]
#> [1] "what"
#> 
#> [[151]]
#> [1] "what's"
#> 
#> [[152]]
#> [1] "when"
#> 
#> [[153]]
#> [1] "when's"
#> 
#> [[154]]
#> [1] "where"
#> 
#> [[155]]
#> [1] "where's"
#> 
#> [[156]]
#> [1] "which"
#> 
#> [[157]]
#> [1] "while"
#> 
#> [[158]]
#> [1] "who"
#> 
#> [[159]]
#> [1] "who's"
#> 
#> [[160]]
#> [1] "whom"
#> 
#> [[161]]
#> [1] "why"
#> 
#> [[162]]
#> [1] "why's"
#> 
#> [[163]]
#> [1] "will"
#> 
#> [[164]]
#> [1] "with"
#> 
#> [[165]]
#> [1] "won't"
#> 
#> [[166]]
#> [1] "would"
#> 
#> [[167]]
#> [1] "wouldn't"
#> 
#> [[168]]
#> [1] "you"
#> 
#> [[169]]
#> [1] "you'd"
#> 
#> [[170]]
#> [1] "you'll"
#> 
#> [[171]]
#> [1] "you're"
#> 
#> [[172]]
#> [1] "you've"
#> 
#> [[173]]
#> [1] "your"
#> 
#> [[174]]
#> [1] "yours"
#> 
#> [[175]]
#> [1] "yourself"
#> 
#> [[176]]
#> [1] "yourselves"

Generate the word cloud with preprocessed text and customized stopwords using CleverWordCloud()

CleverWordCloud() takes in two arguments: a character vector of words and a list of stopwords. It generates a PNG object and an HTML file in the current directory where the function is called. The resulting word cloud is meaningful and reflects accurate frequencies without the influence of different tenses and/or various forms of the same word.

CleverWordCloud(stem_text, new_stopwords)