dbplyr-SQL

library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.4.1 
✔ readr   2.1.2      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(RMySQL)
Loading required package: DBI
db <- dbConnect(
  MySQL(),
  host = "scidb.smith.edu",
  user = "sds192",
  password = "DSismfc@S",
  dbname = "yelp"
)
knitr::opts_chunk$set(connection = db, max.print = 20)
class(db)
[1] "MySQLConnection"
attr(,"package")
[1] "RMySQL"

SHOW DATABASES;
7 records
Database
airlines
citibike
fec
imdb
information_schema
lahman
yelp
USE yelp;

SHOW TABLES;
3 records
Tables_in_yelp
businesses
reviews
users

DESCRIBE users;
4 records
Field Type Null Key Default Extra
user_id varchar(22) NO PRI NA
name varchar(255) NO NA
average_stars decimal(3,2) NO NA
review_count smallint NO NA

DESCRIBE businesses;
8 records
Field Type Null Key Default Extra
id varchar(22) NO PRI NA
name varchar(255) NO NA
stars smallint NO NA
open varchar(5) NO NA
review_count smallint NO NA
city varchar(255) NO NA
longitude decimal(15,12) NO NA
latitude decimal(15,12) NO NA
DESCRIBE reviews;
3 records
Field Type Null Key Default Extra
business_id varchar(22) NO MUL NA
user_id varchar(22) NO MUL NA
stars smallint NO NA
#This creates a tbl_sql object in R called airports


users <- db %>%
  tbl("users")
Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
numeric
reviews <- db %>%
  tbl("reviews")
businesses <- db |>
  tbl("businesses")
Warning in .local(conn, statement, ...): Decimal MySQL column 6 imported as
numeric
Warning in .local(conn, statement, ...): Decimal MySQL column 7 imported as
numeric
# 
# print(object.size(airports),units = "Kb")
# class(airports)
#We can do joins too. 

average_star_5 <- users |>
  
  inner_join(reviews,by ="user_id") |>
  inner_join(businesses,by = c("business_id"="id"),suffix=c(".users",".business")) |>
  filter(stars.business > 4.99) |>
  
  count(city,sort = TRUE)

#The collect() function brings a tabl_sql into R as a tbl_df uses more space.
  # collect()


print(object.size(average_star_5),units="Kb")
33.1 Kb
#tail(average_star_5)
city_with_tasty_food <- businesses |>
  filter(stars > 4.99) |>
  count(city,sort = TRUE)

#show_query(city_with_tasty_food)
SELECT city, COUNT(*) AS n
FROM businesses
WHERE stars > 4.99 
GROUP BY city
ORDER BY n DESC;
Displaying records 1 - 20
city n
Phoenix 1067
Scottsdale 600
Tempe 284
Mesa 236
Chandler 207
Glendale 151
Gilbert 111
Peoria 71
Surprise 36
Avondale 34
Queen Creek 30
Goodyear 26
Cave Creek 25
Fountain Hills 15
Apache Junction 15
Paradise Valley 12
Buckeye 11
Casa Grande 8
Maricopa 8
Wickenburg 7