Load necessary libraries
suppressPackageStartupMessages({
library(meetupr)
library(jsonlite)
library(tidyverse)
library(stringr)
library(tidytext)
library(wordcloud)
library(topicmodels)
library(broom)
library(scales)
})
theme_set(theme_bw())
Overview
Let’s scrape the R-Ladies chapters’ events from Meetup.com. We can use the {meetupr}
package.
urlname <- c("rladies-paris", "rladies-rome")
events <- purrr::map(urlname, get_events)
dat <- dplyr::bind_rows(events)
R-Ladies Rome Events
<- "rladies-rome"
urlname <- get_events(urlname)
events ::arrange(events, desc(time))%>%
dplyrhead()
<- c("rladies-paris","rladies-rome")
urlname <- purrr::map(urlname,get_events)
events <- dplyr::bind_rows(events)
dat %>%
datmutate(link=gsub("https://www.meetup.com/","",link),
chapter=stringr::str_extract(link, "^rladies(.+?)/"),
chapter=gsub("/","",chapter))%>%
count(chapter)
All Chapters Events
To do this for all chapters on Meetup, we need the list of chapters from the R-Ladies GitHub meetup archive.
data <- jsonlite::fromJSON('https://raw.githubusercontent.com/rladies/meetup_archive/main/data/events.json')
chapter <- data %>%
  count(group_urlname) %>%
  filter(!str_detect(group_urlname, "@"))
chapters <- chapter$group_urlname

events <- purrr::map(chapters, get_events)
# saveRDS(events, "events.rds")

# another way:
# x <- lapply(paths, func)
# res <- dplyr::bind_rows(x)

bind_rows(events[1]) %>% head()
dat <- dplyr::bind_rows(events)
# saveRDS(dat, "dat.rds")
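Note that mapping get_events() over every chapter can stop halfway if a single group errors (for example, a deleted or renamed group). As a small optional sketch (not part of the original run), purrr::possibly() makes the loop fault-tolerant:

# Optional sketch: possibly() returns NULL instead of erroring,
# and compact() drops the NULL results before binding
safe_get_events <- purrr::possibly(get_events, otherwise = NULL)
events <- purrr::map(chapters, safe_get_events)
events <- purrr::compact(events)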
dat1 <- dat %>%
  mutate(link = gsub("https://www.meetup.com/", "", link),
         chapter = stringr::str_extract(link, "^rladies(.+?)/"),
         chapter = gsub("/", "", chapter)) %>%
  relocate(chapter)
# saveRDS(dat1, "dat1.rds")

dat2 <- dat1 %>%
  select(time, chapter, title, going, venue_city,
         venue_lon, venue_lat, venue_state, venue_country) %>%
  mutate(time = as.Date(time)) %>%
  arrange(desc(going))

dat2 %>%
  mutate(year = year(time), .after = time) %>%
  pull(year) %>%
  summary()
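As a quick complementary check (a small sketch on the same dat2), we can also count how many events fall in each year:

dat2 %>%
  mutate(year = year(time)) %>%  # year() comes from {lubridate}
  count(year, sort = TRUE)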
dat3 <- dat2 %>%
  # tokenize the event titles into single words
  tidytext::unnest_tokens(word, title, drop = FALSE) %>%
  select(chapter, title, going, word) %>%
  anti_join(get_stopwords()) %>%
  filter(str_length(word) > 3)

dat3 %>%
  count(word, sort = TRUE) %>%
  with(wordcloud::wordcloud(word, n, max.words = 100))
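If the word cloud is hard to read, a plain bar chart of the most frequent words (a sketch built on the same counts) conveys the same information more precisely:

dat3 %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 20) %>%  # keep the 20 most frequent words
  ggplot(aes(n, reorder(word, n))) +
  geom_col() +
  labs(x = "Occurrences in event titles", y = NULL)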
Latent Dirichlet Allocation with the topicmodels package
chapters_dtm <- dat3 %>%
  count(title, word, sort = TRUE) %>%
  cast_dtm(title, word, n)
chapters_dtm

chapters_lda <- topicmodels::LDA(chapters_dtm,
                                 k = 4,
                                 control = list(seed = 1234))
chapters_lda_td <- tidy(chapters_lda)
chapters_lda_td

top_terms <- chapters_lda_td %>%
  group_by(topic) %>%
  slice_max(beta, n = 5) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms

top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta)) +
  geom_col() +
  scale_x_reordered() +
  facet_wrap(vars(topic), scales = "free_x")

assignments <- augment(chapters_lda, data = chapters_dtm)
assignments %>%
  filter(term != "ladies")
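Besides the per-topic word probabilities (beta), the fitted LDA model also contains per-document topic probabilities (gamma). A short sketch (an addition to the original analysis) to see which topic dominates each event title:

chapters_gamma <- tidy(chapters_lda, matrix = "gamma")  # document-topic probabilities
chapters_gamma %>%
  group_by(document) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%  # most likely topic per title
  ungroup() %>%
  count(topic)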
# how words in titles changed over time
inaug_freq <- dat3 %>%
  inner_join(dat2, by = c("chapter", "title", "going")) %>%
  count(time, word) %>%
  complete(time, word, fill = list(n = 0)) %>%
  group_by(time) %>%
  mutate(time_total = sum(n),
         percent = n / time_total) %>%
  ungroup()
inaug_freq
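Before modeling, it can help to eyeball a single word’s trajectory (a quick sketch on inaug_freq; "shiny" is just a hypothetical example word):

inaug_freq %>%
  filter(word == "shiny") %>%  # hypothetical example word
  ggplot(aes(time, percent)) +
  geom_line()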
# library(broom)
# For each frequent word, fit a binomial GLM of word count out of the
# total for that date against time: a positive estimate means the word
# became more common over time.
models <- inaug_freq %>%
  group_by(word) %>%
  filter(sum(n) > 50) %>%
  group_modify(
    ~ tidy(glm(cbind(n, time_total - n) ~ time, .,
               family = "binomial"))
  ) %>%
  ungroup() %>%
  filter(term == "time")
models

models %>%
  filter(term == "time") %>%
  arrange(desc(abs(estimate)))
models %>%
  mutate(adjusted.p.value = p.adjust(p.value)) %>%
  ggplot(aes(estimate, adjusted.p.value)) +
  geom_point(shape = ".") +
  # scale_y_log10() +
  geom_text(aes(label = word),
            # vjust = 1, hjust = 1,
            check_overlap = TRUE) +
  labs(x = "Estimated change over time", y = "Adjusted p-value")
models %>%
  slice_max(abs(estimate), n = 6) %>%
  inner_join(inaug_freq) %>%
  ggplot(aes(time, percent)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(vars(word)) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(y = "Frequency of word in event titles")
Contacts and Information: fede.gazzelloni@gmail.com