The gapminder data set contains demographic statistics popularized by Hans Rosling’s TED talks.
# Attach the gapminder package, then print its `gapminder` tibble.
library(gapminder)
gapminder
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.801 8425333 779.4453
## 2 Afghanistan Asia 1957 30.332 9240934 820.8530
## 3 Afghanistan Asia 1962 31.997 10267083 853.1007
## 4 Afghanistan Asia 1967 34.020 11537966 836.1971
## 5 Afghanistan Asia 1972 36.088 13079460 739.9811
## 6 Afghanistan Asia 1977 38.438 14880372 786.1134
## 7 Afghanistan Asia 1982 39.854 12881816 978.0114
## 8 Afghanistan Asia 1987 40.822 13867957 852.3959
## 9 Afghanistan Asia 1992 41.674 16317921 649.3414
## 10 Afghanistan Asia 1997 41.763 22227415 635.3414
## # ... with 1,694 more rows
Which countries had the fastest-growing GDPs between 1952 and 2007?
In R, you manipulate data by passing the data to functions.
# Base R example: pass a number to round() and ask for two decimal places.
round(1234.567, digits = 2)
## [1] 1234.57
# nrow() takes the whole data set and returns its number of rows.
nrow(gapminder)
## [1] 1704
The tidyverse is a collection of R packages that contain functions. You must load the packages to use the functions.
## install.packages("tidyverse")
library("tidyverse")
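Tidyverse functions are designed to be simple (each one does a single job) and composable (the output of one can be passed straight to the next):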
# Sort every row from the largest population to the smallest.
arrange(
  gapminder,
  desc(pop)
)
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 China Asia 2007 72.96100 1318683096 4959.1149
## 2 China Asia 2002 72.02800 1280400000 3119.2809
## 3 China Asia 1997 70.42600 1230075000 2289.2341
## 4 China Asia 1992 68.69000 1164970000 1655.7842
## 5 India Asia 2007 64.69800 1110396331 2452.2104
## 6 China Asia 1987 67.27400 1084035000 1378.9040
## 7 India Asia 2002 62.87900 1034172547 1746.7695
## 8 China Asia 1982 65.52500 1000281000 962.4214
## 9 India Asia 1997 61.76500 959000000 1458.8174
## 10 China Asia 1977 63.96736 943455000 741.2375
## # ... with 1,694 more rows
# Step 1: keep only the 2007 observations and store them under a name.
gapminder2007 <- filter(
  gapminder,
  year == 2007
)
# Step 2: order the saved 2007 rows by population, largest first.
arrange(
  gapminder2007,
  desc(pop)
)
## # A tibble: 142 x 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 China Asia 2007 72.961 1318683096 4959.115
## 2 India Asia 2007 64.698 1110396331 2452.210
## 3 United States Americas 2007 78.242 301139947 42951.653
## 4 Indonesia Asia 2007 70.650 223547000 3540.652
## 5 Brazil Americas 2007 72.390 190010647 9065.801
## 6 Pakistan Asia 2007 65.483 169270617 2605.948
## 7 Bangladesh Asia 2007 64.062 150448339 1391.254
## 8 Nigeria Africa 2007 46.859 135031164 2013.977
## 9 Japan Asia 2007 82.603 127467972 31656.068
## 10 Mexico Americas 2007 76.195 108700891 11977.575
## # ... with 132 more rows
Use the pipe operator (%>%) to compose tidyverse functions.
# The same filter-then-sort steps as a single pipeline: each `%>%` hands
# the table on its left to the function on its right.
gapminder %>%
  filter(year == 2007) %>%
  arrange(desc(pop))
## # A tibble: 142 x 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 China Asia 2007 72.961 1318683096 4959.115
## 2 India Asia 2007 64.698 1110396331 2452.210
## 3 United States Americas 2007 78.242 301139947 42951.653
## 4 Indonesia Asia 2007 70.650 223547000 3540.652
## 5 Brazil Americas 2007 72.390 190010647 9065.801
## 6 Pakistan Asia 2007 65.483 169270617 2605.948
## 7 Bangladesh Asia 2007 64.062 150448339 1391.254
## 8 Nigeria Africa 2007 46.859 135031164 2013.977
## 9 Japan Asia 2007 82.603 127467972 31656.068
## 10 Mexico Americas 2007 76.195 108700891 11977.575
## # ... with 132 more rows
# Rank countries by life expectancy in 2007 and keep only the two
# columns of interest.
gapminder %>%
  filter(year == 2007) %>%
  arrange(desc(lifeExp)) %>%
  select(country, lifeExp)
## # A tibble: 142 x 2
## country lifeExp
## <fctr> <dbl>
## 1 Japan 82.603
## 2 Hong Kong, China 82.208
## 3 Iceland 81.757
## 4 Switzerland 81.701
## 5 Australia 81.235
## 6 Spain 80.941
## 7 Sweden 80.884
## 8 Israel 80.745
## 9 France 80.657
## 10 Canada 80.653
## # ... with 132 more rows
# Add a total-GDP column: population multiplied by GDP per capita.
gapminder %>%
  mutate(gdp = pop * gdpPercap)
## # A tibble: 1,704 x 7
## country continent year lifeExp pop gdpPercap gdp
## <fctr> <fctr> <int> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan Asia 1952 28.801 8425333 779.4453 6567086330
## 2 Afghanistan Asia 1957 30.332 9240934 820.8530 7585448670
## 3 Afghanistan Asia 1962 31.997 10267083 853.1007 8758855797
## 4 Afghanistan Asia 1967 34.020 11537966 836.1971 9648014150
## 5 Afghanistan Asia 1972 36.088 13079460 739.9811 9678553274
## 6 Afghanistan Asia 1977 38.438 14880372 786.1134 11697659231
## 7 Afghanistan Asia 1982 39.854 12881816 978.0114 12598563401
## 8 Afghanistan Asia 1987 40.822 13867957 852.3959 11820990309
## 9 Afghanistan Asia 1992 41.674 16317921 649.3414 10595901589
## 10 Afghanistan Asia 1997 41.763 22227415 635.3414 14121995875
## # ... with 1,694 more rows
# Collapse the whole table to a single row holding the largest total GDP
# found in any row.
gapminder %>%
  mutate(gdp = pop * gdpPercap) %>%
  summarise(max_gdp = max(gdp))
## # A tibble: 1 x 1
## max_gdp
## <dbl>
## 1 1.293446e+13
# Without grouping, first() and last() read the first and last rows of the
# entire table, so the result is a single row.
gapminder %>%
  mutate(gdp = pop * gdpPercap) %>%
  summarise(
    first_gdp = first(gdp),
    last_gdp = last(gdp)
  )
## # A tibble: 1 x 2
## first_gdp last_gdp
## <dbl> <dbl>
## 1 6567086330 5782658337
# Grouping by country first makes summarise() return one row per country,
# with that country's first and last GDP values.
gapminder %>%
  mutate(gdp = pop * gdpPercap) %>%
  group_by(country) %>%
  summarise(
    first_gdp = first(gdp),
    last_gdp = last(gdp)
  )
## # A tibble: 142 x 3
## country first_gdp last_gdp
## <fctr> <dbl> <dbl>
## 1 Afghanistan 6567086330 31079291949
## 2 Albania 2053669902 21376411360
## 3 Algeria 22725632678 207444851958
## 4 Angola 14899557133 59583895818
## 5 Argentina 105676319105 515033625357
## 6 Australia 87256254102 703658358894
## 7 Austria 42516266683 296229400691
## 8 Bahrain 1188460759 21112675360
## 9 Bangladesh 32082059995 209311822134
## 10 Belgium 72838686716 350141166520
## # ... with 132 more rows
# Compound annual growth rate (CAGR) of total GDP for every country,
# ranked from fastest to slowest growth.
gapminder %>%
  mutate(gdp = pop * gdpPercap) %>%
  group_by(country) %>%
  # Order by year explicitly so first()/last() pick the earliest and latest
  # observations even if the rows are not already sorted by year.
  summarise(
    gdp1952 = first(gdp, order_by = year),
    gdp2007 = last(gdp, order_by = year)
  ) %>%
  # 2007 - 1952 = 55 compounding periods; multiply by 100 to get percent.
  mutate(cagr = ((gdp2007 / gdp1952)^(1 / 55) - 1) * 100) %>%
  arrange(desc(cagr)) %>%
  select(country, cagr)
## # A tibble: 142 x 2
## country cagr
## <fctr> <dbl>
## 1 Singapore 8.348304
## 2 Equatorial Guinea 8.346729
## 3 Oman 8.218950
## 4 Taiwan 7.869795
## 5 Botswana 7.548170
## 6 Korea, Rep. 7.487215
## 7 Hong Kong, China 7.064369
## 8 Thailand 6.384445
## 9 Libya 6.372590
## 10 China 6.337334
## # ... with 132 more rows
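Each tidyverse function expects and returns the same type of data: tidy data. A tabular data set is tidy if and only if (1) each variable is in its own column, (2) each observation is in its own row, and (3) each value is in its own cell.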
Let’s focus on the 10 biggest economies (in 1952). What are they?
# Rank the 1952 observations by total GDP, largest economy first.
gapminder %>%
  filter(year == 1952) %>%
  mutate(gdp = pop * gdpPercap) %>%
  arrange(desc(gdp)) %>%
  select(country, gdp)
## # A tibble: 142 x 2
## country gdp
## <fctr> <dbl>
## 1 United States 2.204242e+12
## 2 United Kingdom 5.032666e+11
## 3 Germany 4.939866e+11
## 4 France 2.984834e+11
## 5 Japan 2.781349e+11
## 6 Italy 2.350603e+11
## 7 China 2.227550e+11
## 8 India 2.033225e+11
## 9 Canada 1.680701e+11
## 10 Brazil 1.193716e+11
## # ... with 132 more rows
# Names of the ten largest economies in 1952, in descending GDP order.
top_10 <- c(
  "United States", "United Kingdom", "Germany", "France", "Japan",
  "Italy", "China", "India", "Canada", "Brazil"
)
ggplot(data = <DATA>) +
<GEOM_FUNCTION>(mapping = aes(<MAPPINGS>))
# Plot total GDP over time for the top-10 economies, one colored line
# per country.
gapminder %>%
  filter(country %in% top_10) %>%
  mutate(gdp = pop * gdpPercap) %>%
  ggplot() +
  geom_line(
    mapping = aes(x = year, y = gdp, color = country)
  )
Let’s scale each country’s GDP by its own first (1952) value to make growth easier to compare.
# Starting point for the scaling: the top-10 countries' rows with a
# total-GDP column added.
gapminder %>%
  filter(country %in% top_10) %>%
  mutate(gdp = pop * gdpPercap)
## # A tibble: 120 x 7
## country continent year lifeExp pop gdpPercap gdp
## <fctr> <fctr> <int> <dbl> <int> <dbl> <dbl>
## 1 Brazil Americas 1952 50.917 56602560 2108.944 1.193716e+11
## 2 Brazil Americas 1957 53.285 65551171 2487.366 1.630498e+11
## 3 Brazil Americas 1962 55.665 76039390 3336.586 2.537119e+11
## 4 Brazil Americas 1967 57.632 88049823 3429.864 3.019989e+11
## 5 Brazil Americas 1972 59.504 100840058 4985.711 5.027594e+11
## 6 Brazil Americas 1977 61.489 114313951 6660.119 7.613445e+11
## 7 Brazil Americas 1982 63.336 128962939 7030.836 9.067173e+11
## 8 Brazil Americas 1987 65.205 142938076 7807.096 1.115931e+12
## 9 Brazil Americas 1992 67.057 155975974 6950.283 1.084077e+12
## 10 Brazil Americas 1997 69.388 168546719 7957.981 1.341292e+12
## # ... with 110 more rows
# Divide each country's GDP by that country's own first value, so every
# line starts at 1 and its height shows growth relative to the start.
gapminder %>%
  filter(country %in% top_10) %>%
  mutate(gdp = pop * gdpPercap) %>%
  group_by(country) %>%
  mutate(scaled_gdp = gdp / first(gdp)) %>%
  ggplot() +
  geom_line(
    mapping = aes(x = year, y = scaled_gdp, color = country)
  )
# Compound annual growth rate (CAGR) of total GDP for the top-10 economies,
# ranked from fastest to slowest growth.
gapminder %>%
  filter(country %in% top_10) %>%
  mutate(gdp = pop * gdpPercap) %>%
  group_by(country) %>%
  # Order by year explicitly so first()/last() pick the earliest and latest
  # observations even if the rows are not already sorted by year.
  summarise(
    start = first(gdp, order_by = year),
    end = last(gdp, order_by = year)
  ) %>%
  # 2007 - 1952 = 55 compounding periods; multiply by 100 to get percent.
  mutate(cagr = ((end / start)^(1 / 55) - 1) * 100) %>%
  arrange(desc(cagr)) %>%
  select(country, cagr)
## # A tibble: 10 x 2
## country cagr
## <fctr> <dbl>
## 1 China 6.337334
## 2 Japan 4.983258
## 3 Brazil 4.973063
## 4 India 4.830628
## 5 Canada 3.658473
## 6 Italy 3.619402
## 7 France 3.383767
## 8 United States 3.269607
## 9 Germany 3.101929
## 10 United Kingdom 2.557105
# Bar chart of the top-10 CAGR table, built by piping that table into ggplot.
gapminder %>%
  filter(country %in% top_10) %>%
  mutate(gdp = pop * gdpPercap) %>%
  group_by(country) %>%
  # Order by year explicitly so first()/last() pick the earliest and latest
  # observations even if the rows are not already sorted by year.
  summarise(
    start = first(gdp, order_by = year),
    end = last(gdp, order_by = year)
  ) %>%
  # 2007 - 1952 = 55 compounding periods; multiply by 100 to get percent.
  mutate(cagr = ((end / start)^(1 / 55) - 1) * 100) %>%
  # Sorting the rows does not reorder the bars: `country` is a factor, and
  # the discrete axis follows its level order rather than row order.
  arrange(desc(cagr)) %>%
  select(country, cagr) %>%
  ggplot() +
  geom_col(mapping = aes(x = country, y = cagr))
library(forcats)
# Horizontal bar chart of CAGR for the top-10 economies, with the bars
# ordered by growth rate.
gapminder %>%
  filter(country %in% top_10) %>%
  mutate(gdp = pop * gdpPercap) %>%
  group_by(country) %>%
  # Order by year explicitly so first()/last() pick the earliest and latest
  # observations even if the rows are not already sorted by year.
  summarise(
    start = first(gdp, order_by = year),
    end = last(gdp, order_by = year)
  ) %>%
  # 2007 - 1952 = 55 compounding periods; multiply by 100 to get percent.
  mutate(cagr = ((end / start)^(1 / 55) - 1) * 100) %>%
  ggplot() +
  # fct_reorder() sets the bar order from cagr, so no arrange() is needed.
  geom_col(mapping = aes(x = fct_reorder(country, cagr), y = cagr)) +
  # CAGR stands for *compound* annual growth rate.
  labs(x = "Country", y = "Compound Annual Growth Rate in GDP (1952-2007)") +
  coord_flip()