90 Lecture: Applications of Large Language Models in Data Science
You can follow along with the slides here if they do not appear below.
90.1 Use Cases in Data Science
90.1.1 R Example: Text Classification (Sentiment Analysis)
library(tidytext)
#> Warning: package 'tidytext' was built under R version 4.4.2
library(dplyr)
library(ggplot2)
# Example input: five short free-text reviews
texts <- c(
  "I love this product! It's amazing.",
  "This is terrible. I hate it.",
  "It's okay, nothing special.",
  "Wow, absolutely fantastic experience!",
  "Disappointed with the quality."
)

# Hold the reviews in a one-column tibble (column `text`), the shape the
# tokenising steps below start from
df <- tibble(text = texts)
# Split each review into one lowercase word per row, then keep only the words
# found in the Bing sentiment lexicon together with their sentiment label.
# The join key is spelled out so the join does not depend on whichever column
# names the two tables happen to share.
df %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing"), by = join_by(word))
#> # A tibble: 7 × 2
#> word sentiment
#> <chr> <chr>
#> 1 love positive
#> 2 amazing positive
#> 3 terrible negative
#> 4 hate negative
#> 5 wow positive
#> 6 fantastic positive
#> 7 disappointed negative
## Tokenize and Perform Sentiment Analysis
# Count the lexicon-matched words per sentiment, place the two counts side by
# side, and use positive minus negative as the overall score.
# Note: the final step needs both a `positive` and a `negative` column, i.e.
# at least one matched word of each kind in the data.
sentiment_scores <- df %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing"), by = join_by(word)) %>%
  count(sentiment) %>%
  # pivot_wider() supersedes spread(); it is called through tidyr:: because
  # none of the library() calls in this section attach tidyr
  tidyr::pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment_score = positive - negative)
sentiment_scores
#> # A tibble: 1 × 3
#> negative positive sentiment_score
#> <int> <int> <int>
#> 1 3 4 1
## Visualize Sentiment Analysis Results
# Horizontal bar chart of how many lexicon-matched words carry each sentiment.
# The join key is given explicitly instead of being inferred from shared
# column names.
df %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing"), by = join_by(word)) %>%
  ggplot(aes(x = sentiment)) +
  geom_bar() +
  coord_flip() +
  theme_classic() +
  labs(title = "Overall Vibes", x = "", y = "Count")
90.1.2 Text Generation (Simple Markov Chain)
library(tidytext)
library(dplyr)
library(stringr)
# Training text for the chain
text <- "The quick brown fox jumps over the lazy dog. The dog barks at the fox. The fox runs away quickly."

# One row per word, paired with the word that follows it in the text.
# The last word has no successor (lead() gives NA), so that row is dropped.
word_pairs <- tibble(text = text) %>%
  unnest_tokens(output = word, input = text) %>%
  mutate(next_word = lead(word, n = 1)) %>%
  na.omit()
# Transition table of the Markov chain: one row per distinct word, with a
# list-column holding every successor observed for it (repeats are kept)
markov_chain <- summarise(
  group_by(word_pairs, word),
  next_words = list(next_word)
)
# Generate text by walking the Markov chain.
#
# Starting at `start_word`, the function repeatedly looks up the successors
# recorded for the current word in the global `markov_chain` table (column
# `word` plus list-column `next_words`) and draws one of them with sample().
#
# Arguments:
#   start_word  First word of the generated text.
#   length      Maximum number of words appended after `start_word`
#               (default 10). Zero or a negative value appends nothing.
# Returns:
#   A single string with the visited words joined by spaces. Generation stops
#   early when the current word has no recorded successor.
generate_text <- function(start_word, length = 10) {
  # The `length` argument only shadows base::length() as a value; calls such
  # as length(x) below still resolve to the function.
  n_steps <- max(0, length)
  generated <- character(n_steps)  # preallocated instead of grown with c()
  n_generated <- 0L
  current_word <- start_word
  # seq_len() is empty for zero steps, whereas 1:0 would run the loop twice
  for (i in seq_len(n_steps)) {
    next_word_options <- unlist(
      markov_chain$next_words[which(markov_chain$word == current_word)]
    )
    if (length(next_word_options) == 0) break
    current_word <- sample(next_word_options, 1)
    generated[i] <- current_word
    n_generated <- i
  }
  paste(c(start_word, generated[seq_len(n_generated)]), collapse = " ")
}
# Generate a sentence that starts with "the". Successors are drawn with
# sample(), so the result differs between runs unless a seed is set first.
generate_text(start_word = "the")
#> [1] "the lazy dog barks at the lazy dog the quick brown"
library(tidytext)
library(dplyr)
# Question answering by word overlap: pick the context sentence that shares
# the most distinct words with the question.
context <- "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France.
It is named after the engineer Gustave Eiffel, whose company designed and built the Tower."
question <- "Who designed the Eiffel Tower?"
# Tokenize context and question
context_tokens <- tibble(text = context) %>%
  unnest_tokens(word, text)
question_tokens <- tibble(text = question) %>%
  unnest_tokens(word, text)
# Simple word overlap for demonstration
matching_words <- intersect(context_tokens$word, question_tokens$word)
# Find sentence with most matching words
sentences <- tibble(text = context) %>%
  unnest_tokens(sentence, text, token = "sentences")
# Each sentence is tokenized the same way as the context and the question.
# Splitting on spaces alone leaves punctuation attached ("eiffel,", "tower."),
# so those words are never counted and the less relevant sentence wins.
best_sentence <- sentences %>%
  mutate(matches = vapply(
    sentence,
    function(s) {
      sentence_words <- tibble(text = s) %>%
        unnest_tokens(word, text) %>%
        pull(word)
      sum(matching_words %in% sentence_words)
    },
    integer(1),
    USE.NAMES = FALSE
  )) %>%
  arrange(desc(matches)) %>%
  slice(1)
print(best_sentence$sentence)
#> [1] "it is named after the engineer gustave eiffel, whose company designed and built the tower."