What is the value of mass_index after running these commands?
# Starting values for the exercise
mass <- 47.5
age <- 100
# mass_index is computed here, from the value mass holds at this point
mass_index <- mass / age
# Doubling mass afterwards rebinds mass only; mass_index is not recomputed
mass <- 2.0 * mass
What type of vector do you get in each of the following cases (hint: use class())?
# c() coerces all of its elements to one common type
# (logical -> numeric -> character, whichever is the most general present).
# A single string makes the whole vector character
num_char <- c(1, 2, 3, "a")
# TRUE is coerced to 1, so the vector stays numeric
num_logical <- c(1, 2, 3, TRUE)
# TRUE becomes the string "TRUE"; the vector is character
char_logical <- c("a", "b", "c", TRUE)
# "4" is a string, so the numbers are converted to strings as well
tricky <- c(1, 2, 3, "4")
Using this vector:
heights <- c(
  63, 69, 60, 65, NA, 68, 61, 70, 61, 59, 64,
  69, 63, 63, NA, 72, 65, 64, 70, 63, 65
)

# Keep the observed heights only: is.na() marks the gaps, `!` negates the mark
heights_no_na <- heights[!is.na(heights)]

# median() can skip missing values itself when asked through na.rm
# (many functions lack this argument, so consult the help page first)
median(x = heights, na.rm = TRUE)

# The NA-free copy gives the same answer with no extra argument
median(x = heights_no_na)

# A comparison yields TRUE/FALSE, and TRUE counts as 1 when summed,
# so this is the number of heights above 67
sum(heights_no_na > 67)
data.frame manipulation
surveys
have? (hint: functions ncol() and nrow())
# Number of rows in the table
nrow(surveys)
# Number of columns in the table
ncol(surveys)
glimpse())
# Print a compact overview of the columns of surveys
glimpse(surveys)
sum() on a logical vector)
# is.na() flags each missing weight as TRUE; summing the flags counts them
sum(is.na(surveys[["weight"]]))
# Extra: express that count as a fraction of all observations
sum(is.na(surveys[["weight"]])) / nrow(surveys)
unique() and length())
# unique() keeps one copy of each genus; length() then counts them
length(unique(surveys$genus))
dplyr
Using pipes, subset the surveys data to include animals collected before 1995 and retain only the columns year, sex, and weight.
# Start from surveys, keep the rows recorded before 1995,
# then keep only the three columns of interest
surveys_pre1995 <- surveys %>%
  filter(year < 1995) %>%
  select(year, sex, weight)
The answer should have 21486 rows of data.
optional:
# Reptiles recorded from 1980 up to (not including) 1990;
# comma-separated conditions in filter() must all hold
surveys %>%
  filter(year >= 1980, year < 1990, taxa == "Reptile")
# Get genus and plot_id of animals collected in 1995
# filter() must come first: once select() has run, the year column is gone
# and filtering on it would fail
surveys %>%
  filter(year == 1995) %>%
  select(genus, plot_id)
# Keep the rows from the earliest year: min(year) is computed over the whole
# year column and each row's year is compared against it
surveys %>%
  filter(min(year) == year)
Create a new data frame from the surveys data that meets the following criteria: contains only the species_id column and a new column called hindfoot_half containing values that are half the hindfoot_length values. In this hindfoot_half column, there are no NAs and all values are less than 30.
Hint: think about how the commands should be ordered to produce this data frame!
# Compute the halved length first, then drop rows where it is missing or
# not below 30, and finally keep only the two requested columns
surveys_hindfoot_half <- surveys %>%
  mutate(hindfoot_half = hindfoot_length / 2) %>%
  filter(!is.na(hindfoot_half), hindfoot_half < 30) %>%
  select(species_id, hindfoot_half)
The answer should have 31436 rows and 2 columns.
optional:
log2() function) is either greater than 2 or smaller than -2. The result should have 805 rows.
# Compute the weight / hindfoot_length ratio, then keep the rows whose
# log2 ratio lies outside [-2, 2] (i.e. its absolute value exceeds 2)
surveys %>%
  mutate(weight_hind_ratio = weight / hindfoot_length) %>%
  filter(abs(log2(weight_hind_ratio)) > 2)
plot_type surveyed?
# Use the count function to count how many in each plot_type
surveys %>%
  count(plot_type)
sum(!is.na(x)) to count how many non-missing values there are in a vector).
# For each species, calculate:
#   - the mean, min and max hindfoot_length (ignoring missing values),
#   - the number of observations,
#   - the number of non-missing hindfoot_length values
surveys %>%
  group_by(species_id) %>%
  summarize(
    mean_hindfoot_length = mean(hindfoot_length, na.rm = TRUE),
    min_hindfoot_length = min(hindfoot_length, na.rm = TRUE),
    max_hindfoot_length = max(hindfoot_length, na.rm = TRUE),
    # n() counts every row in the group, missing or not
    n = n(),
    # !is.na() is TRUE for present values, so the sum counts them
    n_hindfoot = sum(!is.na(hindfoot_length))
  )
year, genus, species_id and weight.
surveys %>%
  # remove missing values, otherwise max(weight) would be NA
  filter(!is.na(weight)) %>%
  # for each year
  group_by(year) %>%
  # retain those where the weight value is equal to the maximum weight value
  # (ties are all kept, so a year can contribute more than one row)
  filter(weight == max(weight)) %>%
  # select only the columns asked for
  select(year, genus, species_id, weight) %>%
  # then sort the table by year
  arrange(year)
surveys_complete, which:
weight, hindfoot_length and sex
# First filter away the missing values
surveys_complete <- surveys %>%
  filter(!is.na(weight) & !is.na(sex) & !is.na(hindfoot_length))
# Add a column with the count of each species
# (ungroup() afterwards so the grouping does not leak into later steps)
surveys_complete <- surveys_complete %>%
  group_by(species_id) %>%
  mutate(n_obs = n()) %>%
  ungroup()
# Now filter the table so that number of observations is >= 50
surveys_complete <- surveys_complete %>%
  filter(n_obs >= 50)
# Or all of the above could have been done in one pipeline
surveys_complete <- surveys %>%
  filter(!is.na(weight) & !is.na(sex) & !is.na(hindfoot_length)) %>%
  group_by(species_id) %>%
  mutate(n_obs = n()) %>%
  filter(n_obs >= 50) %>%
  # drop the grouping so surveys_complete is a plain table for later use
  ungroup()
# note the course notes shows a different way of doing this:
# https://datacarpentry.org/R-ecology-lesson/03-dplyr.html#exporting_data
The final data frame should have 30463 rows.
data_output folder of your project directory (use write_csv() function).
# Save the cleaned table as CSV; the data_output folder must already exist
write_csv(surveys_complete, "data_output/surveys_complete.csv")
ggplot2
Boxplots are useful summaries, but hide the shape of the distribution. For example, if the distribution is bimodal, we would not see it in a boxplot. An alternative to the boxplot is the violin plot, where the shape (of the density of points) is drawn.
geom_violin() to create violin plots of weight distributions for each genus
# One violin per genus, showing the distribution of weight
surveys_complete %>%
  ggplot(aes(x = genus, y = weight)) +
  geom_violin()
scale_y_log10().
# Same violin plot, with weight drawn on a log10 y axis
surveys_complete %>%
  ggplot(aes(x = genus, y = weight)) +
  geom_violin() +
  scale_y_log10()
?geom_boxplot help)?
# Overlay a narrow boxplot on each violin; outlier.shape = NA hides the
# boxplot's outlier points
surveys_complete %>%
  ggplot(aes(x = genus, y = weight)) +
  geom_violin() +
  geom_boxplot(width = 0.2, outlier.shape = NA) +
  scale_y_log10()
species_id, with violin plots coloured by genus.
# One violin per species_id; the fill aesthetic is set on the violin layer
# only, so the boxplots keep their default fill
surveys_complete %>%
  ggplot(aes(x = species_id, y = weight)) +
  geom_violin(aes(fill = genus)) +
  geom_boxplot(width = 0.2, outlier.shape = NA) +
  scale_y_log10()