R nanocourse Perspective: Mapping UK Tweets

Mapping a sentiment analysis of UK tweets against the population and income of each UK district.

Table of Contents


Using the SKEMA Quantum Studio framework (Warin 2019), this course teaches you how to map a sentiment analysis of UK tweets against the population and income of each UK district.

Loading packages


library(rgdal)
library(leaflet)
library(leaflet.extras)
library(RColorBrewer)
library(tidyverse)
library(tools)
library(reshape2)
library(ggplot2)
library(ggridges)
library(lubridate)
library(rtweet)
library(maps)
library(quanteda)
library(wordcloud)
library(tidytext)
library(sf)
# Chunk defaults for the rendered document: show the source of every chunk
# (echo) but do not execute it (eval).
knitr::opts_chunk$set(eval = FALSE, echo = TRUE)

Income by district


## INCOME BY DISTRICT
# Builds `income`: one row per (lad17nm, year) holding the rounded mean of
# `income_tot_mean`, with district names rewritten so that they can be joined
# on `lad17nm` to the population table and the district boundaries.

library(gsheet)

# One worksheet per year, all stored in the same Google Sheets workbook.
income_2012 <- gsheet2tbl("https://docs.google.com/spreadsheets/d/1HosE_dBedt3Idho0f1jr-MQBt7PS_mpge7c1GIip0Yg/edit#gid=2078260586")
income_2013 <- gsheet2tbl("https://docs.google.com/spreadsheets/d/1HosE_dBedt3Idho0f1jr-MQBt7PS_mpge7c1GIip0Yg/edit#gid=1695992236")
income_2014 <- gsheet2tbl("https://docs.google.com/spreadsheets/d/1HosE_dBedt3Idho0f1jr-MQBt7PS_mpge7c1GIip0Yg/edit#gid=0")
income_2015 <- gsheet2tbl("https://docs.google.com/spreadsheets/d/1HosE_dBedt3Idho0f1jr-MQBt7PS_mpge7c1GIip0Yg/edit#gid=725888174")

income <- bind_rows(income_2012, income_2013, income_2014, income_2015)

names(income)[names(income) == "districts"] <- "lad17nm"

# Strip suffixes/prefixes and normalise punctuation in the district names.
income$lad17nm <- gsub(" UA", "", income$lad17nm)
income$lad17nm <- gsub(" Towns", "", income$lad17nm)
income$lad17nm <- gsub("-", " ", income$lad17nm)
income$lad17nm <- gsub("The ", "", income$lad17nm)
income$lad17nm <- gsub(" City of$", "", income$lad17nm)

# Whole-name renames. These patterns are anchored with ^...$ on purpose: an
# unanchored "Down" would also rewrite "North Down" (breaking the merge into
# "Ards and North Down" below) and would corrupt any name that already
# contains the replacement text; likewise for "Edinburgh".
income$lad17nm <- gsub("^Edinburgh$", "City of Edinburgh", income$lad17nm)
income$lad17nm <- gsub("^Down$", "Newry, Mourne and Down", income$lad17nm)
income$lad17nm <- gsub("Newry and Mourne", "Newry, Mourne and Down", income$lad17nm)
income$lad17nm <- gsub("Rhondda Cynon Taff", "Rhondda Cynon Taf", income$lad17nm)
income$lad17nm <- gsub("South Buckinghamshire", "South Bucks", income$lad17nm)
income$lad17nm <- gsub("Comhairle nan Eilean Siar", "Na h Eileanan Siar", income$lad17nm)

# Former districts that are reported under a merged district name. Exact
# matches only; the column is addressed by name rather than by position.
merged_district <- c(
  "Antrim"       = "Antrim and Newtownabbey",
  "Newtownabbey" = "Antrim and Newtownabbey",
  "Armagh"       = "Armagh City, Banbridge and Craigavon",
  "Banbridge"    = "Armagh City, Banbridge and Craigavon",
  "Craigavon"    = "Armagh City, Banbridge and Craigavon",
  "Derry"        = "Derry City and Strabane",
  "Strabane"     = "Derry City and Strabane",
  "Fermanagh"    = "Fermanagh and Omagh",
  "Omagh"        = "Fermanagh and Omagh",
  "Lisburn"      = "Lisburn and Castlereagh",
  "Castlereagh"  = "Lisburn and Castlereagh",
  "Ards"         = "Ards and North Down",
  "North Down"   = "Ards and North Down"
)
is_former <- income$lad17nm %in% names(merged_district)
income$lad17nm[is_former] <- unname(merged_district[income$lad17nm[is_former]])

# Rows that now share a name and year are collapsed into their mean.
income <- aggregate(income_tot_mean ~ lad17nm + year, data = income, FUN = mean)
income$income_tot_mean <- round(income$income_tot_mean, digits = 0)

income$lad17nm <- toTitleCase(income$lad17nm)

Population by district


## POPULATION BY DISTRICT
# Builds `population`: one row per (lad17nm, year) for 2012-2015, with
# district names cleaned the same way as the boundary data so the tables
# can be joined on `lad17nm`.

population <- gsheet2tbl("https://docs.google.com/spreadsheets/d/1HosE_dBedt3Idho0f1jr-MQBt7PS_mpge7c1GIip0Yg/edit#gid=1762880462")

# Drop the columns that are not needed (by position, then by name).
population <- population[, -c(14:22)]

population$`Area2(sqkm)` <- NULL
population$Geography <- NULL
population$Code <- NULL

# Wide -> long: columns 2 to 10 become (year, population) pairs.
# NOTE(review): assumes those nine columns are named by year - confirm
# against the worksheet.
population <- pivot_longer(
  population,
  cols = 2:10,
  names_to = "year",
  values_to = "population"
)

names(population)[names(population) == "Name"] <- "lad17nm"

population$year <- as.numeric(population$year)

# Keep the years for which income data is loaded.
population <- filter(population, year %in% 2012:2015)

population$lad17nm <- gsub(", City of", "", population$lad17nm)
population$lad17nm <- gsub(", County of", "", population$lad17nm)
population$lad17nm <- gsub("-", " ", population$lad17nm)
population$lad17nm <- gsub("'", "", population$lad17nm)
population$lad17nm <- gsub("St ", "St. ", population$lad17nm)
# The district is spelled "Folkestone and Hythe"; the optional "e" makes the
# pattern match the correct spelling as well as the previous misspelling.
population$lad17nm <- gsub("Folke?stone and Hythe", "Shepway", population$lad17nm)

population$lad17nm <- toTitleCase(population$lad17nm)

Income and Population by district


## INCOME AND POPULATION
# Attach the yearly income figure to every (district, year) population row.
# Population rows without a matching income row are kept, with NA income.
district_data <- population %>%
  left_join(income, by = c("lad17nm", "year"))

Tweets


## LOADING TWEETS
tweets.overall <- read_csv("~/onGoingResearch/articlesGisRefugeeCrisis/developpement/gisRefugeeCrisis_resources/development/data/tweets/tweets.overall.csv")

## KEEPING TWEETS OF UK
# Latitude/longitude bounding box; the exact district of each tweet is
# resolved later against the district boundaries.
tweets.overall.LatLong <- filter(
  tweets.overall,
  latitude >= 49.771686, latitude <= 60.862568,
  longitude >= -12.524414, longitude <= 1.785278
)

## TWEETS MINING
tweets <- tweets.overall.LatLong

# The year is the first four characters of the date.
tweets.overall.LatLong$year <- substr(tweets.overall.LatLong$date, 1, 4)

# Lookup table from line number to year and coordinates. seq_len() yields an
# empty sequence (not c(1, 0)) when no tweet falls inside the bounding box.
tweets.LatLong <- tibble(line = seq_len(nrow(tweets.overall.LatLong)),
                         year = tweets.overall.LatLong$year,
                         latitude = tweets.overall.LatLong$latitude,
                         longitude = tweets.overall.LatLong$longitude)

# Cleaning
text <- tweets$content

# remove retweet entities
text <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", text)
# remove at people
text <- gsub("@\\w+", "", text)
# remove links: match up to the next whitespace. "http\\w+" stopped at the
# ":" and left "://t.co/..." fragments behind. Links are removed before the
# digits so that the URL is still intact when it is matched.
text <- gsub("http\\S+", "", text)
# remove picture links; punctuation is not stripped beforehand, so the dotted
# form "pic.twitter.com/..." has to be matched as well
text <- gsub("pic\\.twitter\\.com/\\S+|pictwitter\\w+ *", "", text)
# remove punctuation
# text <- gsub("[[:punct:]]", "", text)
# remove numbers
text <- gsub("[[:digit:]]", "", text)
# drop everything that cannot be represented in ASCII
text <- iconv(text, "latin1", "ASCII", sub = "")

# Tibble format: `line` is the key shared with tweets.LatLong
text_df <- tibble(line = seq_along(text), text = text)

# Tokenization: one row per word, stop words removed
tidy_tweets <- text_df %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Join tweets to longitude and latitude by line number
tidy_tweets_LatLong <- left_join(tidy_tweets, tweets.LatLong, by = "line")

# Words that contribute to positive and negative sentiment
AFINN <- get_sentiments("afinn")

afinn_word_LatLong <- tidy_tweets_LatLong %>%
  inner_join(AFINN, by = "word")

# Score of a tweet = sum of the AFINN values of its words
afinn_word_LatLong_Tot <- aggregate(value ~ line + year + latitude + longitude,
                                    data = afinn_word_LatLong, FUN = sum)

afinn_word_LatLong_Tot$sentiment <- case_when(
  afinn_word_LatLong_Tot$value > 0 ~ "positive",
  afinn_word_LatLong_Tot$value == 0 ~ "neutral",
  afinn_word_LatLong_Tot$value < 0 ~ "negative"
)

# Only clearly positive or negative tweets are kept
afinn_word_LatLong_Tot_PN <- filter(afinn_word_LatLong_Tot, sentiment != "neutral")

Mapping the data


## SHAPEFILE MAP DISTRICT UK
# Reads the district boundaries, assigns every scored tweet to the district
# that contains it, attaches population and income, and writes the result
# to "tweets_sentiment_income_pop_latlong_final.csv".

# Clean district names; used for both boundary objects so that they cannot
# drift apart.
clean_lad_names <- function(x) {
  x <- gsub(", City of", "", x)
  x <- gsub(", County of", "", x)
  x <- gsub("-", " ", x)
  x <- gsub("'", "", x)
  x <- gsub("St ", "St. ", x)
  toTitleCase(x)
}

lad_dir <- "./shapefiles/ladUK"
lad_layer <- "Local_Authority_Districts_December_2017_Full_Clipped_Boundaries_in_United_Kingdom_WGS84"

# Spatial* version of the boundaries (its @data slot is used below and by
# the leaflet map).
district <- readOGR(dsn = lad_dir, layer = lad_layer)
district@data$lad17nm <- clean_lad_names(district@data$lad17nm)

# sf version of the same boundaries, used for the point-in-polygon lookup.
map <- read_sf(file.path(lad_dir, paste0(lad_layer, ".shp")))
map$lad17nm <- clean_lad_names(map$lad17nm)

# remove = FALSE keeps longitude/latitude as ordinary columns, so they do not
# have to be joined back in afterwards.
pnts_sf <- st_as_sf(afinn_word_LatLong_Tot_PN,
                    coords = c("longitude", "latitude"),
                    crs = st_crs(map),
                    remove = FALSE)

# Index of the district containing each tweet. st_intersects() returns one
# integer vector per point: empty when the point is outside every district,
# and possibly longer than one when it lies on a shared boundary. Taking the
# first hit explicitly handles both cases; as.integer() on the whole result
# fails as soon as one point has several hits.
hits <- st_intersects(pnts_sf, map)
first_hit <- vapply(
  hits,
  function(h) if (length(h) > 0) h[[1]] else NA_integer_,
  integer(1)
)

# Tweets that fall in no district are dropped.
pnts <- pnts_sf %>%
  mutate(intersection = first_hit,
         lad17nm = map$lad17nm[first_hit]) %>%
  filter(!is.na(intersection), !is.na(lad17nm))

# year was extracted as text; the join key in district_data is numeric
pnts$year <- as.numeric(pnts$year)

tweets_sentiment_income_pop_latlong <- pnts %>%
  left_join(district_data, by = c("lad17nm", "year")) %>%
  st_drop_geometry()

tweets_sentiment_income_pop_latlong$intersection <- NULL

# District identifiers from the boundary attribute table
districtID <- select(district@data, objectid, lad17cd, lad17nm)

tweets_sentiment_income_pop_latlong <- left_join(tweets_sentiment_income_pop_latlong,
                                                 districtID, by = "lad17nm")

# Final column order; objectid is exported as lad17id
tweets_sentiment_income_pop_latlong_final <- select(tweets_sentiment_income_pop_latlong,
                                                    line, sentiment, value, longitude,
                                                    latitude, year, lad17nm, lad17cd,
                                                    lad17id = objectid, population,
                                                    income_tot_mean)

write.csv(tweets_sentiment_income_pop_latlong_final, "tweets_sentiment_income_pop_latlong_final.csv")

## MAP DISTRICT UK
# Choropleth of income per district with the scored tweets drawn on top as
# green (positive) / red (negative) dots.

# district_data holds one row per district AND year. Joining it as-is would
# give the attribute table several rows per polygon, so colours and labels
# would no longer line up with the shapes. Keep one year (the most recent
# one available) and at most one row per district before joining.
map_year <- max(district_data$year, na.rm = TRUE)
district_year <- district_data %>%
  filter(year == map_year) %>%
  distinct(lad17nm, .keep_all = TRUE)

district@data <- left_join(district@data, district_year, by = "lad17nm")
stopifnot(nrow(district@data) == length(district@polygons))

bins <- c(20000, 30000, 40000, 50000, 60000, 70000, 100000, 150000, 200000)
pal <- colorBin("YlOrRd", domain = district@data$income_tot_mean, bins = bins)

# Hover label: district name in bold, income underneath
labels <- sprintf(
  "<strong>%s</strong><br/>%g £",
  district@data$lad17nm, district@data$income_tot_mean
) %>% lapply(htmltools::HTML)

palTweets <- colorFactor(c("limegreen", "red"), domain = c("positive", "negative"))

leaflet() %>%
  setView(-0.118092, 51.509865, 4) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addFullscreenControl() %>%
  addPolygons(data = district,
              fillColor = ~pal(income_tot_mean),
              weight = 2,
              opacity = 1,
              color = "white",
              dashArray = "2",
              fillOpacity = 0.7,
              # full argument name instead of the partially matched `highlight`
              highlightOptions = highlightOptions(
                weight = 3,
                color = "#666",
                dashArray = "",
                fillOpacity = 0.7,
                bringToFront = FALSE),
              label = labels,
              labelOptions = labelOptions(
                style = list("font-weight" = "normal", padding = "3px 8px"),
                textsize = "15px",
                direction = "auto")) %>%
  addLegend(pal = pal,
            values = district@data$income_tot_mean,
            opacity = 0.7,
            title = "Average Total Income",
            position = "bottomright") %>%
  # tweets of every year are drawn; only the district layer is per-year
  addCircleMarkers(data = afinn_word_LatLong_Tot_PN, lng = ~longitude, lat = ~latitude,
    radius = 1,
    color = ~palTweets(sentiment),
    stroke = FALSE,
    fillOpacity = 1
  )

Warin, Thierry. 2019. “SKEMA Quantum Studio: A Technological Framework for Data Science in Higher Education.” https://doi.org/10.6084/m9.figshare.8204195.v2.

Citation

For attribution, please cite this work as

Warin (2020, April 15). Virtual Campus: R nanocourse Perspective: Mapping UK Tweets. Retrieved from https://virtualcampus.skemagloballab.io/posts/Rnanocourseperspective-mappinguktweets/

BibTeX citation

@misc{warin2020r,
  author = {Warin, Thierry},
  title = {Virtual Campus: R nanocourse Perspective: Mapping UK Tweets},
  url = {https://virtualcampus.skemagloballab.io/posts/Rnanocourseperspective-mappinguktweets/},
  year = {2020}
}