Cheat Sheet for Data Analytics | Study notes Data Analysis & Statistical Methods

Obj <- value (or Obj = value): assign a number or character string to an object

Vector <- c(number, "value")

length(V): number of elements in V

min/max/range/mean/sum(V)

setwd("location")

Dataframe <- read.csv("file name")

summary(df): simple stats of each variable

#(min quartile median mean max)/N(length)

head(df): first six observations

tail(df): last six observations

head(df$variable): first 6 obs. for variable

Same for mean/table – frequencies

Save file:

write.csv(resume, file="resume1.csv")

save(resume, file="resume1.RData")

! not, & and, | or, != not equal, >= greater than or equal to, <= less than or equal to

== tests equality (e.g. 5 == 6 is FALSE)

!= tests inequality (e.g. 5 != 6 is TRUE)

newV <- V2 > V1: logical vector marking obs. where V2 > V1 (use it inside [] to keep or drop those obs.)

df<- c(1,2,3,4)

df[3] = 3 / df[-3] = 1 2 4

[] indexing:

df[c(1,3)] = 1 3

df[c(TRUE, FALSE, TRUE, FALSE)]

= 1, 3 (only 1st & 3rd is true)

school <- c("UCSD", "UCB", "UCLA", "UCR")

school=="UCSD"

[1] TRUE FALSE FALSE FALSE

Subset graduationdate vector

graduationdates[school=="UCSD"]

[1] 2010

Conditional vector

Vec <- df$variable == "condition"

Frequency of conditional vector

sum(df$variable == "condition")

= # of obs.

Mean of V1 with the condition of V2

mean(df$variable[df$variable2]) (variable2 must be a logical TRUE/FALSE vector)

Subsetting data frames

students <- data.frame(school=c("UCSD",

"UCB", "UCSD"),

graduationdate=c(2010, 2019, 2015))

students

school graduationdate

1 UCSD 2010

2 UCB 2019

3 UCSD 2015

Specify data frame, row, column:

students[3,1] / students[1,] / students[,2]

[1] "UCSD" / 1 UCSD 2010 / [1] 2010 2019 2015

Extract row 1&3 w/o 2

students[c(1,3),] (or use students[-2,])

school graduationdate

1 UCSD 2010

3 UCSD 2015

Extract where ==“UCSD” is true

students$school=="UCSD"

[1] TRUE FALSE TRUE

⬆️ (returns a logical vector) ⬇️ use it to filter rows

students[students$school=="UCSD",]

school graduationdate

1 UCSD 2010

3 UCSD 2015

Base R: subset(df, logical statement)

subset(students,students$school=="UCSD")

school graduationdate

1 UCSD 2010

3 UCSD 2015

Create dataframe for subset

resume_blacknames <-resume[resume$race=="black",]

resume_whitenames <-

resume[resume$race=="white",]

mean of subset df

mean(resume_blacknames$variable)

[1] 0.06447639

CONDITIONAL STATEMENTS

if (logical statement) {

code to be executed if logical statement is TRUE

}

EG) door <- "locked"

if (door=="locked") {

print("sorry, you need a key to enter")

}

[1] "sorry, you need a key to enter"

EG2) door <- "unlocked"

if (door=="locked") {

print("sorry, you need a key to enter")

} NOT EXECUTED AS DOOR LOCKED NOT TRUE

ELSE EG) door <- "locked"

if (door=="locked"){

print("Sorry, you need a key to enter")

} else {   # the closing } and else must be on the same line

print("Please Come in!")

}

[1] "Sorry, you need a key to enter"

Executes the else branch if door <- "unlocked"

[1] "Please Come in!"

Loops:

for (some set of things) {

do some stuff

}

EG) i <- 2

print(2*i)

[1] 4

EG2) for (i in c(3,10,99)){

print(2*i)

}

[1] 6 [1] 20 [1] 198

EG3) homework <- c("math", "reading", "writing")

for (i in homework) {

cat("Do", i, "\n")

}

Do math

Do reading

Do writing

*cat(): concatenates & prints

*\n: display in next line

Same process by looping over words in hw:

for (i in 1:length(homework)) {

cat("Do", homework[i], "\n")

}

Dimensions (rows, columns): dim(df)

Subset individuals (age>=25 & age<=34 & mother2==1):

mothers2534 <- subset(df, mother2==1 & age>=25 &

age<=34)

install tidyverse

install.packages("tidyverse")

library(tidyverse)

convert data frame to tibble:

rr <- as_tibble(rr)

retrieves rows of data that meet certain condition:

filter(dataframe, some logical statement)

EG) mothers2534 <- filter(rr, age<=34 & age>=25 &

mother2==1)

Or EG) filter(mothers2534, dataset%in%2003:2008)

Pipe eg) mothers2534 <- rr %>% filter(age<=34 &

age>=25 & mother2==1)

Select: keep only the wanted columns of the tibble:

Eg) mothers2534 <- select(mothers2534, dataset,

mother2, age, childtot)

Sort data based on values of variable:

mothers2534 <- arrange(mothers2534,age)

want descending order:

mothers2534 <- arrange(mothers2534,desc(age))

head(mothers2534$age)

[1] 34 34 34 34 34 34

Pipe operator:

f(x) = x %>% f()

For multiple functions:

h(g(f(x)))

x %>%

f %>%

g %>%

h

1st filter 2nd selected variables needed:

mothers2534 <- rr %>%

filter(age<=34 & age>=25 & mother2==1) %>%

select(dataset, mother2, age, childtot)

*if want descending order add:

%>%

arrange(desc(age))

new variable

df$newvarname <- expression

eg) rr$childcollegeprep <- rr$childeduc + rr$childtravel

or: mutate(dataframe, newvarname = expression)

eg) rr <- mutate(rr,

childcollegeprep=childtravel+childeduc)

can create multiple new vars at the same time:

rr <- mutate(rr, childcollegeprep=childtravel+childeduc,

childnotcollegeprep=childtot-childcollegeprep)

create new var, drop all prior var:

collegeprepdat <- transmute(rr,

childcollegeprep = childeduc + childtravel,

childnotcollegeprep = childtot - collegeprep)

Error in `transmute()`:

ℹ In argument: `childnotcollegeprep = childtot -

collegeprep`.

Caused by error:

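! object 'collegeprep' not found

*the error occurs because the new variable is named childcollegeprep, not collegeprep; use childtot - childcollegeprep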

Summarize() – generate summary stats:

summarize(rr, meanchildtot = mean(childtot, na.rm=T))

# A tibble: 1 × 1

meanchildtot

<dbl>

1 4.69

*1st: summarize(rr, ...) = generate summary stats from dataset rr

*2nd: meanchildtot = name given to the summary statistic

*na.rm=T ignores missing values

Can compute multiple var at once:

summarize(rr, meanchildtot = mean(childtot, na.rm=T),

medianchildtot=median(childtot, na.rm = T))

mean of childtot taken by each value of dataset, store

under meanchildtot:

rr %>%

group_by(dataset) %>%

summarize(meanchildtot=mean(childtot,na.rm=T))

create unique combinations:A 2010, A 2015, B2010,

B2015

student.df %>%

group_by(school,graduationdate) %>%

summarize(mean.gpa=mean(gpa))

`summarise()` has grouped output by 'school'. You can

override using the `.groups` argument.

# A tibble: 4 × 3

# Groups: school [2]

school graduationdate mean.gpa

1 A 2010 3.45

2 A 2015 2.9

3 B 2010 3.6

4 B 2015 1.8

Save output:

totchildbyyearcollege <- rr %>%

group_by(dataset, college) %>%

summarize(meanchildtot = mean(childtot, na.rm=T))

combination:

collegeprep <- rr %>%

filter(mother2==1, age>24, age<35) %>%

mutate(collegeprep = childeduc + childtravel) %>%

group_by(dataset, college) %>%

summarize(meancollegeprep=mean(collegeprep,

na.rm=T))

`summarise()` has grouped output by 'dataset'. You can

override using the `.groups` argument.

Filter the wanted mother data, create a new var, group the data by

dataset and college & summarize the mean

Create histogram:

hist(pm_bycity$meanpm10,

xlab="Mean PM10",

ylab="Frequency",

main="Mean PM10 by City")

create multiple plots:

par(mfrow=c(1,2))

SINGLE ROW TWO COLUMNS (indicate with mfrow=)

#Smaller bins

hist(pm_bycity$meanpm10, xlab="Mean PM10",

ylab="Frequency", main="Mean PM10 by City",

breaks=20)

#Larger bins

hist(pm_bycity$meanpm10, xlab="Mean PM10",

ylab="Frequency", main="Mean PM10 by City",

breaks=4)

new var T = date - auto_date (T<0: before automation, T>0: after automation):

pm_bycitybefore <- pm %>%

mutate(T=date-auto_date) %>%

filter(T<0) %>%

group_by(code_city) %>%

summarize(meanpm10 = mean(pm10, na.rm=TRUE))

pm_bycityafter <- pm %>%

mutate(T=date-auto_date) %>%

filter(T > 0) %>%

group_by(code_city) %>%

summarize(meanpm10 = mean(pm10, na.rm=TRUE))

comparing histograms:

par(mfrow=c(1,2))

#Plot histogram for before

hist(pm_bycitybefore$meanpm10, xlab="MeanPM10",

ylab="Frequency",

main="Before Automation")

#Plot histogram for after

hist(pm_bycityafter$meanpm10, xlab="MeanPM10",

ylab="Frequency",

main="After Automation")

same scale for both histograms:

#Create two panes for plots

par(mfrow=c(1,2))

#Plot histogram for before

hist(pm_bycitybefore$meanpm10, xlab="MeanPM10",

ylab="Frequency",

main="Before automation",

xlim=c(0,250), ylim=c(0,50))

lines(c(meanbefore, meanbefore), c(-10, 100),

lty=2, col="red")

#Plot histogram for after

hist(pm_bycityafter$meanpm10, xlab="MeanPM10",

ylab="Frequency",

main="After automation",

xlim=c(0,250), ylim=c(0,50))

lines(c(meanbefore, meanbefore), c(-10, 100),

lty=2, col="red")

boxplot function:

boxplot(pm_byday$meanpm10)

lines that extend from the box are referred to as

whiskers

whiskers – extend to the most extreme observation within 1.5 × IQR of the box

IQR = Q3-Q1

*Horizontal, blue, with label box plot:

boxplot(pm_byday$meanpm10, xlab="Mean PM10",

main="Mean PM 10 by city",

col="blue", border="darkblue",

pch=16, horizontal=T)

separate box plots:

(1st add new var)

pm_byday$after <- pm_byday$T>0

boxplot(pm_byday$meanpm10 ~ pm_byday$after,

xlab="Mean PM10", ylab="After Automation",

main="Mean PM 10 by Day Before and After

Automation",

col="blue", border="darkblue",

pch=16, horizontal=T)

scatter plot:

plot(df$xvar, df$yvar)

*pch = plotting symbol (e.g. pch=16 is a filled circle)

*lines() add line

EG) plot(pm_byday$T, pm_byday$meanpm10,

xlab="Days Relative to Automation",

ylab="Mean PM10",

main="Automation and Mean PM10",

pch=16, ylim=c(0,300))

lines(c(0,0), c(-10,250), col="red", lty=2)

text(0,260, "Automation", col="red")

save plot as pdf:

pdf("Automation.pdf")

dev.off() stop saving into pdf

library(lubridate)

today(), now(): show the current date / date-time

*ymd("2012-01-22") converts a string into a date obj.; can

also use mdy()/dmy()/ydm()

*can add days()/months()/years()

*extract a portion of a date – e.g. year

year(ymd("2012-01-22"))

#> [1] 2012

*add

ymd("1960-01-01")+days(18628)

#> [1] "2011-01-01"

Plot mean amount of rain by month:

pm_bymonth <- pm %>%

mutate(rdate = ymd("1960-01-01") + days(date),

month = month(rdate)) %>%

group_by(month) %>%

summarize(meanrain = mean(rain, na.rm=TRUE))

plot(pm_bymonth$month, pm_bymonth$meanrain,

col="blue",

pch=16,

xlab="Date",

Cheat Sheet for Data Analytics, Study notes of Data Analysis & Statistical Methods

Related documents

Partial preview of the text

Download Cheat Sheet for Data Analytics and more Study notes Data Analysis & Statistical Methods in PDF only on Docsity!

A tibble: 1 × 1

A tibble: 4 × 3

Groups: school [2]

GGPLOT – SCATTER PLOT