109 Quick preview
# Show the leading rows of the captions table
head(captions)
- Add a few tidy metrics on the captions (length, lexical variety proxy, sentiment proxy)
- caption_length: number of words
- unique_words / lexical_diversity: number of distinct words and its ratio to caption_length (rough heuristic for lexical variety)
- sentiment_score: net sentiment proxy (positive minus negative word matches) using tidytext’s bing lexicon
Code: library(tidytext)
# Net Bing polarity per lexicon word (+1 positive, -1 negative), built once
# instead of re-joining the lexicon for every caption. A word listed under
# both polarities nets to 0, which is what a per-caption join would sum to.
bing_net <- get_sentiments("bing") %>%
  group_by(word) %>%
  summarise(
    score = sum(if_else(sentiment == "positive", 1, -1)),
    .groups = "drop"
  )
bing_lookup <- setNames(bing_net$score, bing_net$word)

# Per-caption metrics: word count, distinct-word count, lexical diversity
# (distinct words / total words), and a net sentiment score.
metrics <- captions %>%
  mutate(
    caption_clean = str_squish(str_to_lower(caption)),
    # Tokenize on word boundaries so punctuation does not stick to tokens
    # ("great!" -> "great"); caption_length uses the same boundary rule, so
    # the diversity ratio compares like with like.
    # NOTE(review): hyphenated lexicon entries may no longer match once
    # captions are split on word boundaries -- confirm this is acceptable.
    words = str_extract_all(caption_clean, boundary("word")),
    caption_length = str_count(caption_clean, boundary("word")),
    # na.rm keeps a missing caption from counting as one distinct word.
    unique_words = map_int(words, ~ n_distinct(.x, na.rm = TRUE)),
    # Undefined (NA) when the caption has no words.
    lexical_diversity = if_else(
      caption_length > 0,
      unique_words / caption_length,
      NA_real_
    ),
    # Words absent from the lexicon index to NA and are dropped, so a caption
    # with no lexicon matches scores 0.
    sentiment_score = map_dbl(words, ~ sum(bing_lookup[.x], na.rm = TRUE))
  ) %>%
  select(
    meme_id, format, design, caption,
    caption_length, unique_words, lexical_diversity, sentiment_score
  )
# Show the leading rows of the computed metrics table
head(metrics)
- Visualize results
- Compare designs A vs B across memes: caption length, sentiment, lexical diversity.
Code:
# Caption length by design: one boxplot per prompt design, faceted by the
# `format` column of `metrics`.
ggplot(metrics, aes(x = design, y = caption_length, fill = design)) +
  geom_boxplot() +
  facet_wrap(~ format) +
  labs(
    title = "Caption length by design (A vs B) across memes",
    x = "Prompt Design",
    y = "Caption length (words)"
  ) +
  theme_minimal()