Lesson 1: Introduction to R
This introductory lesson of the course focuses on core R concepts: setting up the R environment, understanding data types and structures, managing R packages, handling data import and export, and performing essential data wrangling and manipulation tasks for spatial data analysis. These foundational skills prepare students for the subsequent advanced lessons, which focus on applying R to geospatial data processing, analysis, and visualization.
Learning outcomes
At the end of this lesson, the students should be able to:
- Configure the R environment and interact with the RStudio interface.
- Gain insight into different data types and structures in R.
- Join attribute tables and conduct essential data wrangling and manipulation tasks.
- Install and load different packages from different sources.
- Import and export various datasets using different packages.
1.1. Setting up the R environment
Setting up the R environment involves the installation, configuration, and preparation of R along with RStudio on your computer. RStudio is an integrated development environment (IDE) for R that offers tools and features to facilitate R code development, data analysis, and visualization. Downloading and installing R and RStudio is a simple and straightforward process.
To install R:
- Go to this link
- Click on Download R to access the download page.
- Click on base.
- Click on the Download link (this default link will lead you to the latest version; if you desire earlier versions, click on Previous releases).
- Once the download is complete, proceed to install R just as you would install any other software.
To install RStudio:
- Go to this link
- Scroll down to Installers for Supported Platforms near the bottom of the page.
- Click on the download link corresponding to your computer's operating system.
- Once the download is complete, proceed to install RStudio just as you would install any other software.
If you are using the Windows operating system, it's advisable to install RTools. RTools is a collection of tools and utilities for Windows that provides a development environment for building and installing R packages on the Windows operating system. RTools simplifies the process of creating packages and ensures compatibility with the R environment on Windows.
To install RTools on Windows, follow these steps:
- Go to this link
- Click on 'RTools 4.3 (for R-4.3.* versions)' to download the Rtools43 installer.
- Once the download is complete, proceed to install RTools just as you would install any other software.
In some cases, updating the RTools entry in the Windows environment variables might be needed.
1.2. Getting started with R
Throughout this course, we will be working with R through RStudio. To launch RStudio, simply click on the RStudio icon or type RStudio at the Windows Run prompt, just like opening any other program. After you open RStudio, you should see something like this:
In the default RStudio interface, you will see only three panes dividing the screen: the Console, Files, and Environment panes. To open the Script pane, go to the File menu, then select New File and choose R Script, or use the keyboard shortcut Ctrl + Shift + N. This creates a fourth pane in the top-left of the application window, and the Console pane shifts down to the bottom-left area.
Let's start our coding journey by typing the simple statements shown below. To run your code, select the code and then either click on the Run button located at the top-right of the script pane or use the shortcut Ctrl + Enter.
# This is my first code
# Printing a welcome message to the console
print("Welcome to R! R is fun!")
[1] "Welcome to R! R is fun!"
# Check the complete version of the installed R
# Print the R version string
R.version.string
[1] "R version 4.3.1 (2023-06-16)"
# Using R as a calculator
my_sum = 3 + 5 # Perform addition (we avoid the name 'sum', which is a base R function)

# Print the result
print(paste("The sum of the numbers is:", my_sum)) # Display the sum
[1] "The sum of the numbers is: 8"
# Create a basic scatter plot
plot(1:10)
The # in the above code chunk is a comment character in R. Anything to the right of a # in a script will be ignored by R. You can easily comment or uncomment an entire chunk of text or code by selecting the desired lines and simultaneously pressing Ctrl + Shift + C on your keyboard.
To stay organized during the lab sessions, we recommend using your already created course project folder, usually located at C:\Users\yourname\geopython2023, and creating new R lesson folders under it, e.g., R_01 for lesson 1.
To create the sub-folders in R, we can use the dir.create() function. The following code creates a sub-folder named R_01 within geopython2023.
# Create a sub-folder within the geopython2023 folder
dir.create("C:/Users/yourname/geopython2023/R_01")
To save our code (e.g., the one we created above under the This is my first code header), go to the File menu, select Save As..., navigate to the geopython2023\R_01 folder, and save it with a suitable file name.
Once your scripts are saved, you can use the getwd() function to examine the directory where your code has been stored. To list the files within that directory, you can use the dir() function.
# Check the current working directory
getwd()
# Check the files in the directory
dir()
In case you need to read or write files from a specific location outside the geopython2023/R_01 folder, you must set the working directory using the setwd() function, specifying the desired path as an argument as follows:
setwd("Your/Folder/Path")
We can also set a working directory manually by clicking on the Session menu, then selecting Set Working Directory, then Choose Directory, and navigating to your desired location.
1.3. Creating objects
Creating objects refers to the process of defining and assigning values to variables or objects. R stores everything as an object, including data, functions, models, and output. Creating an object can be done using the assignment operator (<- or =). To insert the assignment operator <-, we can use the keyboard shortcut Alt + - (Windows) or Option + - (Mac).
# Assigning a value to an object
object_name <- value

# Alternatively, you can use '=' for assignment
object_name = value
Although both the <- and = operators can be used to assign values to objects, it is recommended to use <-, since the = operator also serves the purpose of passing arguments to functions.
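The difference matters mostly inside function calls. A minimal sketch (assuming no object named x exists yet in your workspace): using = inside a call only names an argument, while <- creates an object you can reuse.
# '=' inside a function call passes an argument; it does not create an object
mean(x = c(2, 4, 6))
[1] 4

# '<-' creates an object in the workspace that can be reused later
y <- c(2, 4, 6)
mean(y)
[1] 4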
The name of an object reflects the data it holds, making it possible to treat the name as a representation of the information within the object itself.
Using meaningful names can help avoid potential confusion. There are a few rules to keep in mind while naming an object in R (see the short example after the list):
- Object names should be short and explicit.
- Object names can be created using letters (A-Z, a-z), numbers (0-9), dots (.), or underscores (_), but cannot start with a number.
- Data is different than data as R is case sensitive.
- Names of default functions should not be used as object names (function, if, else, while, etc.).
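A short sketch of these rules in practice (the object names below are made up for illustration only):
# Valid names: letters, numbers, dots, and underscores are allowed
pop_2023 <- 1300000
my.data <- "Estonia"

# Invalid name: starts with a number (uncommenting this line would raise an error)
# 2023_pop <- 1300000

# R is case sensitive, so Pop_2023 and pop_2023 would be two different objects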
1.4. Data types and data structures
Before diving into specific data processing and analysis tasks, it’s imperative to take some time to examine the data types and data structures in R.
1.4.1. Data types
In R, data types define the type of data that a variable can hold or store. Data types help R understand how to store, manipulate, and process different kinds of information. The class() and typeof() functions are used to determine the data type or class of an object.
The most common data types in R include:
Numeric: Numeric data type represents all real numbers with or without decimal values. It is important to note that by default, numeric values are stored in R as double (dbl) unless otherwise specified.
# Create a numeric data object
my_num <- 3.14
my_num
[1] 3.14
# Check the class name of variable
class(my_num)
[1] "numeric"
# Check the type of variable
typeof(my_num)
[1] "double"
# Check if it's numeric
is.numeric(my_num)
[1] TRUE
Integers: Integer data type specifies real values without decimal points. We use the suffix L to specify integer data.
# Create an integer object
my_int <- 2L

# Check the class
class(my_int)
[1] "integer"
Complex: Complex data types represent complex numbers, which have both a real and an imaginary part. We use the suffix i to specify the imaginary part.
# Create a complex number
my_complex <- 3 + 2i # 2i represents the imaginary part
# Check the class
class(my_complex)
[1] "complex"
Character: Character data type is used to specify character or string values in a variable. Strings are enclosed within single (' ') or double (" ") quotes.
# Create a character data object
my_char <- "Hello, World!"
# Check the class
class(my_char)
[1] "character"
# Enclosing numbers within quotes also returns a character
my_char2 <- "3.5"
# Check the class
class(my_char2)
[1] "character"
Logical (Boolean): Logical data types represent binary values, which can be either TRUE or FALSE. These are often used for making logical comparisons and decisions.
# Create a logical data object
my_logic <- T # T is shorthand for TRUE
# Check the class
class(my_logic)
[1] "logical"
Date: Date objects are designed for storing and working with date values. In the code below, we first define the object my_dt_char as a character, then use the function as.Date() to convert it to a date. The format() function is then used with the format string %B %d, %Y to display the date as "Month Day, Year".
# Define a date string
my_dt_char <- "2023-10-31"

# Convert the date string into a Date object
my_dt_date <- as.Date(my_dt_char)
class(my_dt_date)
[1] "Date"

# Format the date as "Month Day, Year"
my_dt_formated <- format(my_dt_date, "%B %d, %Y")
my_dt_formated
[1] "October 31, 2023"
Raw: Raw data type is used to store binary data as a sequence of bytes. It is a fundamental data type that represents unprocessed, binary data without any specific interpretation. Raw data is often used for tasks such as reading and writing binary files, working with low-level data formats, or encoding data that doesn't fit neatly into other R data types.
We can use the charToRaw() function to convert character data to raw data and the rawToChar() function to convert raw data to character data.
# Create a raw vector representing ASCII values for "I Love R!"
raw_R <- as.raw(c(73, 32, 76, 111, 118, 101, 32, 82, 33))
raw_R
[1] 49 20 4c 6f 76 65 20 52 21

# Convert the raw data to a character string
string_R <- rawToChar(raw_R)
string_R
[1] "I Love R!"
We can convert one type of data to another type using the as.*() function, where the * represents various data type classes.
- as.character(): converts an object to a character data type.
- as.numeric(): converts an object to a numeric data type.
- as.integer(): converts an object to an integer data type.
- as.logical(): converts an object to a logical data type.
Let's convert some of the data types we created earlier to other types:
# Convert numeric to character
num_char <- as.character(my_num)
class(num_char)
[1] "character"

# Convert character to numeric
char_num <- as.numeric(my_char2)
class(char_num)
[1] "numeric"
1.4.2. Data structures
Data structure is a way of organizing and storing data in a logical way. In other words, a data structure is a collection of data types arranged in a specific order. Selecting the right data structure ensures that data can be effectively utilized, enabling faster and more accurate analysis and computations.
Here are some common types of data structures in R:
Vectors: Vectors are the basic data structures in R, representing a one-dimensional array of elements of the same data type. Vectors can be created using the c() function or by coercion from other data types (as.vector()). The is.vector() function checks if an object is a vector.
# Numeric vector
my_vector1 <- c(1, 3, 5, 7, 9)

# Check if "my_vector1" is a vector (should return TRUE)
is.vector(my_vector1)
[1] TRUE

# Character vector
my_vector2 <- c("A", "B", "c", "D", "E", "F")

# Check the class
class(my_vector1)
[1] "numeric"
Using square brackets with one or multiple indices enables us to extract (subset) specific values from vectors.
# Subset elements at the 2nd and 4th positions
subset_vector <- my_vector1[c(2, 4)]
subset_vector
[1] 3 7
Lists: Lists are a versatile data structure that can store elements of different data types. They can be nested and contain vectors, data frames, and other lists. Lists are sometimes called recursive vectors, because a list can contain other lists. This makes them fundamentally different from atomic vectors. In R, lists are created using the list() function.
# Create a list with different elements
my_list <- list(c("Red", "Green"), c(21, 32, 11), TRUE, 51.23, 119.1)
my_list
[[1]]
[1] "Red" "Green"
[[2]]
[1] 21 32 11
[[3]]
[1] TRUE
[[4]]
[1] 51.23
[[5]]
[1] 119.1
# Convert the list to a vector using the "unlist()" function
list_vec <- unlist(my_list)
list_vec
[1] "Red" "Green" "21" "32" "11" "TRUE" "51.23" "119.1"
Matrices: Matrices are two-dimensional data structures consisting of rows and columns, where all elements are of the same data type, typically numeric. You can create matrices using the matrix() function by specifying the data elements and the dimensions (rows and columns).
my_matrix <- matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), # data elements
                    nrow = 3, ncol = 3,           # dimensions (no. of rows and columns)
                    byrow = TRUE)                 # by default, matrices are filled in column-wise order
my_matrix
[,1] [,2] [,3]
[1,] 1 2 3
[2,] 4 5 6
[3,] 7 8 9
Data Frames: A data frame is the most common way of storing and displaying tabular data in R. A data frame is a two-dimensional data structure similar to a matrix, but it can store elements of different data types in each column. Data frames are created using the data.frame() function.
# Create a data frame named 'my_df' with columns for ID, Landuse, Description, and Year
my_df <- data.frame(
  ID = c(101, 102, 103, 104),
  Landuse = c('Residential', 'Commercial', 'Industrial', 'Park'),
  Description = c(
    'Residential zone',
    'Commercial zone',
    'Industrial zone',
    'Public park zone'),
  Year = c(1990, 2005, 1985, 1970)
)

# Display 'my_df'
my_df
ID Landuse Description Year
1 101 Residential Residential zone 1990
2 102 Commercial Commercial zone 2005
3 103 Industrial Industrial zone 1985
4 104 Park Public park zone 1970
# Check the class of 'my_df'
class(my_df)
[1] "data.frame"
The cbind() and rbind() functions are used to combine data frames. When combining column-wise, the number of rows must match, but row names are ignored. When combining row-wise, both the number and names of columns must match. Use plyr::rbind.fill() to combine data frames that don't have the same columns.
Let's create a new data frame called my_df2 with a single column (Area) of four values, and then combine it with my_df:
# Create a data frame called my_df2
my_df2 <- data.frame(
  Area = c(450, 200, 150, 75)
)

# Combine my_df and my_df2 by columns
my_df3 <- cbind(my_df, my_df2)

# Combine my_df and my_df2 by rows
my_df4 <- plyr::rbind.fill(my_df, my_df2) # ?plyr::rbind.fill
Data frame subsetting, or the process of extracting elements from a data frame, is a frequent task in spatial data analysis. We can access particular rows (observations) or columns in a data frame by employing square brackets [] and the dollar $ operator.
Let's extract some elements from our combined data frame (my_df3):
# Access the "Landuse" column using the $ operator
my_df3$Landuse
[1] "Residential" "Commercial" "Industrial" "Park"
# Access the cell in the third row and fourth column
my_df3[3, 4]
[1] 1985
# Filter rows where Area is equal to 200
my_df3[my_df3$Area == 200, ]
ID Landuse Description Year Area
2 102 Commercial Commercial zone 2005 200
# Filter rows where Area is between 100 and 200
my_df3[my_df3$Area > 100 & my_df3$Area < 200, ]
ID Landuse Description Year Area
3 103 Industrial Industrial zone 1985 150
# Filter rows where Area is either greater than 100 or less than 200
my_df3[my_df3$Area > 100 | my_df3$Area < 200, ]
ID Landuse Description Year Area
1 101 Residential Residential zone 1990 450
2 102 Commercial Commercial zone 2005 200
3 103 Industrial Industrial zone 1985 150
4 104 Park Public park zone 1970 75
# Note: a conditional statement based on the & (logical and) operator requires that both conditions are met; however, the | (logical or) operator requires that at least one of the conditions is met.
The $ symbol in R is used to access and extract components (such as variables or columns) from data structures, particularly data frames and lists.
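As a small illustration with a hypothetical named list (the names city and population are made up here), the $ operator extracts a single element by name:
# Access list elements by name with the $ operator
my_named_list <- list(city = "Tartu", population = 97000)
my_named_list$city
[1] "Tartu"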
Factors: Factors are a specific type of vector designed for categorical variables with levels. We can create a factor by using the factor() function.
Let's extract the 'Landuse' column from my_df3 and transform it into a factor:
# Convert the 'Landuse' column to a factor
my_factor <- factor(my_df3$Landuse)
# Display the attributes
attributes(my_factor)
$levels
[1] "Commercial" "Industrial" "Park" "Residential"
$class
[1] "factor"
Similar to our approach with data types, we can also convert one type of data structure to another using the as.*() function, where in this case * represents data structure classes.
- as.data.frame(): converts an object to a data frame.
- as.list(): converts an object to a list.
- as.matrix(): converts an object to a matrix.
- as.vector(): converts an object to a vector.
- as.factor(): converts an object to a factor.
Let's convert my_matrix, the object we created earlier, from a matrix to a data frame:
# Convert my_matrix to a data frame
df5 <- as.data.frame(my_matrix)
# Check the class of the resulting data frame (should return "data.frame")
class(df5)
[1] "data.frame"
☛ Exercise (15 minutes)
1. Create a data frame object called estonia_slope containing the following three columns and their respective elements:
- slope: fill this column with five slope values (0, 50, 100, 200, 300).
- level: assign qualitative slope categories such as flat, gentle, moderate, steep, and very steep.
- landuse: provide the following landuse class for each respective slope category (agriculture, agriculture, shrubland, grassland, and forestland).
2. Convert the data class of the slope variable to a factor.
1.5. R Packages
Packages are collections of R functions, data, and compiled code in a well-defined format. The base installation of R comes with many useful packages as standard. These packages contain many of the functions you will use on a daily basis.
More packages are added later, when they are needed for some specific purpose. These packages can be downloaded from a variety of sources, most commonly from the CRAN website. The Comprehensive R Archive Network (CRAN) is R's central software repository, supported by the R Foundation.
To install a package from CRAN, you can use the install.packages() function with the name of your desired package in quotes as the first argument.
# Install single package
install.packages("Package Name") # Replace "Package Name" with the actual package name
# Install multiple packages
install.packages(c("Package Name1","Package Name2", "Package Name3"))
# Note that Package names are case sensitive!
While CRAN is still the most popular repository for R packages, you will find quite a lot of packages that are only available from GitHub (from source). Furthermore, if you would like to try out the latest development versions of popular packages, you have to install them from GitHub.
To install and manage packages from remote sources like GitHub repositories or other version control systems, we need to install the remotes package first.
install.packages("remotes")
Now you can install any package from a GitHub repository by providing userName/packageName as the argument to the install_github() function.
# Install the package from GitHub
remotes::install_github("github_username/packageName") # replace "github_username" with the actual GitHub username and "packageName" with the name of the package you want to install
By default, install.packages() installs the most recent version of a package, but there may be cases where installing the latest version could disrupt your existing code. To install a particular package version, we can also use the remotes::install_version() function.
remotes::install_version("package Name", "version")
After the necessary packages have been installed, the next step is loading or importing them. This process makes the functions, datasets, and other assets offered by the package accessible for utilization within your R session. When you load or import a package, it's typically added to the R library or environment, making its functionality accessible for your current R script or session.
R packages are stored under a directory called library in the R environment. To load a package, we use the library() function:
# Load the specified package (replace "package Name" with the actual package name)
library("package Name")
The .libPaths() function is used to display the library paths where R looks for installed packages. These library paths are directories where R stores packages that can be loaded and used in your R sessions. To display the library paths where R is currently looking for packages, simply run the following command:
.libPaths()
[1] "/home/geoadmin/R/x86_64-pc-linux-gnu-library/4.3"
[2] "/usr/local/lib/R/site-library"
[3] "/usr/lib/R/site-library"
[4] "/usr/lib/R/library"
The above output lists the library paths where R is looking for installed packages: a user-specific library path and the system-level library paths.
You can get the list of all the installed packages and their versions using the following commands:
# List all installed packages
installed_packages <- installed.packages()

# Keep only the "Package" and "Version" columns (installed.packages() returns a matrix)
subset_installed_packages <- installed_packages[, c("Package", "Version")]
# To view the list of loaded (or attached) packages during an R session
search()
1.6. Data wrangling and manipulation
The data we get to work with will rarely be in a format ready for analysis. The process of cleaning, structuring, and transforming raw or messy data into a more usable and structured format is called data wrangling. This often involves addressing issues like missing values, merging datasets, reshaping data, and converting data types. Data manipulation, on the other hand, is a more general term that involves filtering, sorting, aggregating, or transforming data to perform specific operations.
Data wrangling and data manipulation both refer to the process of preparing and transforming raw data and making it tidy and ready for further analysis.
1.6.1. Attribute join
Spatial data, which contains geometries, and non-spatial data, which contains attributes, are often stored separately. Combining such datasets based on keys, which are columns found in both datasets, is called an attribute join. With a join, the variables of one dataset are connected to the corresponding variables in another dataset, thus facilitating data analysis and integration.
In this lesson, we will work with two distinct datasets: the settlements dataset, which exclusively provides geometric information, and the population dataset in CSV format, which contains population-related attributes for various settlements. To effectively visualize the distribution of population across these settlements, we must join the population attributes with the spatial information.
To achieve this, we will require two packages: sf and tidyverse. Tidyverse is a collection of eight core R packages designed to facilitate data manipulation, visualization, and analysis. We will be utilizing this package consistently throughout the course.
The sf package is specifically designed for handling vector data in R. It stands for "simple features" and provides a unified framework for representing and manipulating vector data. We will explore more about this package in lesson 2.
Install and load packages:
# Install packages
# install.packages("tidyverse")
# install.packages("sf")
# Load packages
library(sf)
library(tidyverse)
# Please be aware that when installing R packages, you should enclose the package names in quotation marks, whereas when loading packages, no quotation marks are needed.
Let's import the settlements data containing the geometric information and take a closer look at its contents. The st_read() function of the sf package can be used to read vector data by specifying the path to your data.
Download the settlements data from this link and save the file in the same directory as the R script file (…/geopython2023/R_01).
# Import settlements data
geometry <- st_read("settlements.gpkg")
Reading layer `geometry' from data source
`/home/geoadmin/dev/simply-general/teaching/geospatial_python_and_r/R_01/settlements.gpkg'
using driver `GPKG'
Simple feature collection with 4713 features and 1 field
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: 369032.1 ymin: 6377141 xmax: 739152.8 ymax: 6634019
Projected CRS: Estonian Coordinate System of 1997
# Determine the data class of the "geometry" object
class(geometry)
[1] "sf" "data.frame"
# Display the structure
glimpse(geometry)
Rows: 4,713
Columns: 2
$ KOOD <chr> "9599", "1885", "3050", "2654", "9010", "1846", "1081", "4193", "…
$ geom <MULTIPOLYGON [m]> MULTIPOLYGON (((693596.1 64..., MULTIPOLYGON (((6884…
The result shows that the geometry object contains two variables: KOOD (representing unique codes) and geom (representing the geometric information).
Now, let's generate a plot using the basic R plot function:
plot(geometry$geom)
Now, let's import the population data. To import CSV files into R, we can use the base R read.csv() function (the readr package, part of the tidyverse collection, offers a similar read_csv() function).
Please download the population data from this link and save the file in the same directory as the R script file (…/geopython2023/R_01).
# Import population data
pop <- read.csv("population.csv")

# The "KOOD" variable in the population data comprises four-digit numbers.
# However, during the CSV saving process, leading zeros were automatically removed.
# The next code appends leading zeros to the "KOOD" values that consist of only three digits.
pop$KOOD <- str_pad(pop$KOOD, width = 4, side = "left", pad = "0")

# Display a structured summary
glimpse(pop)
Rows: 4,713
Columns: 7
$ VID <int> 66016912, 66013199, 66013544, 66013330, 66016492, 66013175,…
$ KOOD <chr> "9599", "1885", "3050", "2654", "9010", "1846", "1081", "41…
$ NIMI <chr> "Võuküla", "Holvandi küla", "Kiisa küla", "Kanassaare küla"…
$ VAARTUS <chr> "60", "46", "74", "54", "65", "501", "100", "146", "239", "…
$ STAMP_CRE <chr> "1/25/2021", "1/25/2021", "1/25/2021", "1/25/2021", "1/25/2…
$ JUHUSLIK <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ GEOKODEER8 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
With both the spatial and non-spatial datasets available, we can now move forward and join them using a common field, denoted by KOOD. In R, there are different types of attribute joins. In our case, we will use the left_join() function from the dplyr package in the tidyverse collection. The left_join() returns all rows from the first data frame, regardless of whether there is a match in the second data frame.
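As a quick illustration of this behaviour with two made-up toy data frames (df_a and df_b are hypothetical names, not part of our course data), rows of the first table without a match in the second are kept and filled with NA:
# Toy example: left_join() keeps all rows of the first data frame
df_a <- data.frame(KOOD = c("0001", "0002", "0003"), name = c("A", "B", "C"))
df_b <- data.frame(KOOD = c("0001", "0003"), pop = c(10, 30))

left_join(df_a, df_b, by = "KOOD")
  KOOD name pop
1 0001    A  10
2 0002    B  NA
3 0003    C  30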
# Join pop and geometry based on the "KOOD" field
pop_geom_joined <- left_join(pop, geometry, by = "KOOD")
Oops, we've encountered an error related to the data type discrepancy in the KOOD variable. In the pop object, KOOD is stored as an integer, while in the geometry object, it's a character. To resolve this and maintain consistency, we need to convert the data type of the "KOOD" variable in the "pop" object to character, aligning it with its data type in the "geometry" object.
# Convert KOOD from integer to character type
pop <- pop %>%
  mutate(KOOD = as.character(KOOD))

# Join again
pop_geom_joined <- left_join(pop, geometry, by = "KOOD")
The %>% (or, since R 4.1.0, the native |>) in R is called the forward pipe operator. It allows us to combine multiple operations in R into a single sequential chain of actions. It takes the left-hand side output as the first argument for the right-hand side function, streamlining data transformations and manipulations.
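A minimal sketch of what the pipe does; both lines below compute the same result, but the piped version reads left to right instead of inside out:
# Without the pipe: function calls are nested inside one another
round(sqrt(sum(c(1, 4, 9))), 1)
[1] 3.7

# With the pipe: each result is passed on as the first argument of the next call
c(1, 4, 9) %>% sum() %>% sqrt() %>% round(1)
[1] 3.7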
Now, let's explore the joined (pop_geom_joined) dataset more thoroughly:
# Provides a structured summary of your data
glimpse(pop_geom_joined)
# colnames() retrieves the column names in your data
colnames(pop_geom_joined)
# summary() provides a summary of the main descriptive statistics for each variable in your data
summary(pop_geom_joined)
# dim() returns the dimensions (number of rows and columns) of your data
dim(pop_geom_joined)
# Checks and displays the class of your data
class(pop_geom_joined)
Now, we have successfully joined the population attributes with the spatial information and created a new object as a data frame. This data frame structure is ideal for our data manipulation and preparation processes. However, when it comes to spatial visualization at a later stage, it will be necessary to transform it into the sf format.
Let's start the process of tidying up our data by deleting certain columns that are not important for our analysis. R offers different options to delete columns:
- using the $ operator with NULL, commonly used for eliminating a single column. In R, the $ symbol is used to access specific elements of an object, be it variables within a data frame or elements within a list.
- using the select() function with a negative sign (-) preceding the variable names.
# Deleting the "VID" column using the $ operator
pop_geom_joined$VID <- NULL

# Deleting multiple columns using the select() function and %>% operator
pop_geom_joined <- pop_geom_joined %>% # Overwriting the existing object
  dplyr::select(-GEOKODEER8, -JUHUSLIK, -STAMP_CRE)
Another useful function is rename(), which, as you may have guessed, changes the names of variables/columns. Renaming columns allows you to assign more meaningful and descriptive names to variables. Renaming columns in R is a straightforward process using the rename() function.
Let's rename the Estonian column names in our pop_geom_joined object to English for common understanding:
# Create a new data frame, pop_geom_joined_renamed, by renaming columns from pop_geom_joined
pop_geom_joined_renamed <- pop_geom_joined %>%
  rename(Settlements = NIMI,   # rename the "NIMI" column to "Settlements"
         Population = VAARTUS, # rename the "VAARTUS" column to "Population"
         Code = KOOD)          # rename the "KOOD" column to "Code"
Thus far, we've explored techniques for deleting, renaming, and selecting specific variables in the data frame. Now, we can move forward with a more in-depth analysis by focusing only on the Population variable from our cleaned dataset.
Let's check the format of the "Population" variable within the "pop_geom_joined_renamed" object:
glimpse(pop_geom_joined_renamed)
Rows: 4,713
Columns: 4
$ Code <chr> "9599", "1885", "3050", "2654", "9010", "1846", "1081", "4…
$ Settlements <chr> "Võuküla", "Holvandi küla", "Kiisa küla", "Kanassaare küla…
$ Population <chr> "60", "46", "74", "54", "65", "501", "100", "146", "239", …
$ geom <MULTIPOLYGON [m]> MULTIPOLYGON (((693596.1 64..., MULTIPOLYGON …
The Population variable is expected to be of a numeric data type. In our current context, it is stored as a character type, necessitating its conversion to a numeric data type. As demonstrated previously, the transformation from character to numeric is straightforward using the as.numeric() function. However, in this specific instance, we will employ the mutate() function to perform the conversion and update the "Population" variable accordingly.
mutate() is a function from dplyr which is used to create or modify columns in a data frame. Another similar function is transmute(), which is used to create or modify columns in a data frame while dropping all other columns (a short sketch of transmute() follows the output below).
# Create a new data frame 'pop_new' based on 'pop_geom_joined_renamed' and convert the 'Population' column to numeric data type
pop_new <- pop_geom_joined_renamed %>%
  mutate(Population = as.numeric(Population))

# Display a summary of the structure of the 'pop_new' data frame
glimpse(pop_new)
Rows: 4,713
Columns: 4
$ Code <chr> "9599", "1885", "3050", "2654", "9010", "1846", "1081", "4…
$ Settlements <chr> "Võuküla", "Holvandi küla", "Kiisa küla", "Kanassaare küla…
$ Population <dbl> 60, 46, 74, 54, 65, 501, 100, 146, 239, 51, 65, 55, 55, 51…
$ geom <MULTIPOLYGON [m]> MULTIPOLYGON (((693596.1 64..., MULTIPOLYGON …
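For comparison, here is a brief sketch of transmute() using the same conversion; unlike mutate(), it keeps only the columns listed inside the call (pop_transmuted is a throwaway name used just for this illustration):
# transmute() keeps only the columns it creates or lists, dropping the rest
pop_transmuted <- pop_geom_joined_renamed %>%
  transmute(Settlements,
            Population = as.numeric(Population))

# Only "Settlements" and "Population" remain; "Code" and "geom" are dropped
glimpse(pop_transmuted)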
Let’s examine the summary statistics of the “Population” variable:
summary(pop_new$Population)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
4.0 19.0 40.0 299.2 84.0 116153.0 282
1.6.2. Handling missing data
Handling missing data is one of the common tasks in data analysis. In R, missing values are often represented by the symbol NA. R provides various functions and techniques to identify and handle missing values.
The sum() function in conjunction with is.na() provides the count of NAs within our object.
Let's examine whether the Population variable contains any missing values both before and after converting its data type.
# Check if "pop_geom_joined_renamed" contains any missing values
anyNA(pop_geom_joined_renamed)
[1] FALSE
# What about "pop_new"?
anyNA(pop_new)
[1] TRUE
# Total number of NAs
sum(is.na(pop_new$Population))
[1] 282
The outcome indicates that there are 282 missing values (NAs) within the "Population" variable following its conversion to a numeric format. What is the source of these missing values (NAs)?
Let's closely examine the "Population" column by using the View() function:
View(pop_geom_joined_renamed)
The result indicates that there are values marked as <4, signifying population sizes that are less than 4 and have been anonymized for privacy reasons. R automatically assigns NA to such values during the numeric conversion, resulting in the appearance of the 282 NAs.
If there's a need to eliminate NAs from the dataset for any purpose, you can make use of the na.exclude() or drop_na() functions:
# Eliminate missing values from the "pop_new" data
na_excl <- na.exclude(pop_new)
na_drop <- drop_na(pop_new)

# Check if "na_drop" contains any missing values
anyNA(na_drop)
[1] FALSE
Upon a thorough inspection of the NAs within the dataset, the next step involves addressing them, as their presence can potentially disrupt the analysis. In our specific case, the NAs will be addressed by substituting them with the value 2, which represents an approximate "average" for the population range that has been converted to NAs. By making this substitution, we can maintain the integrity of the overall population data while effectively accounting for the anonymized values.
In the next code, the coalesce() function from the dplyr package checks if the Population column has NAs and then replaces them with the value 2.
# Fill missing values in the "Population" column with the value 2
pop_new_filled <- pop_new %>%
  mutate(Population = coalesce(Population, 2))

# Check if the NAs have been successfully replaced
sum(is.na(pop_new_filled$Population))
[1] 0
Great! Now we have successfully handled the NAs in our data and made it ready for further analysis. Before proceeding into more in-depth analysis, let's gain a better understanding of the Population variable in our updated object (pop_new_filled) by using basic descriptive statistics.
# Find the minimum value
min(pop_new_filled$Population)
[1] 2
# Find the maximum value
max(pop_new_filled$Population)
[1] 116153
# Calculate the standard deviation
sd(pop_new_filled$Population)
[1] 3115.68
# Calculate the median
median(pop_new_filled$Population)
[1] 36
# Generate a summary of descriptive statistics
summary(pop_new_filled$Population)
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.0 16.0 36.0 281.4 80.0 116153.0
# Determine the interquartile range (IQR)
IQR(pop_new_filled$Population)
[1] 64
# Calculate specific percentiles (25th, 50th, and 75th)
quantile(pop_new_filled$Population, probs = c(0.25, 0.50, 0.75))
25% 50% 75%
16 36 80
We can also use a histogram to better understand the frequency distribution of the Population data.
hist(pop_new_filled$Population)
The histogram shown above depicts that a substantial proportion of the settlements have populations within the 0 to 20,000 range. Nevertheless, the extended width of the x-axis can be attributed to the presence of certain settlements in our population dataset with populations as high as 120,000.
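Because the distribution is so strongly skewed, one optional way to see the lower range more clearly (not required for the rest of the lesson) is to plot the histogram of log10-transformed values; this is safe here since every value is at least 2:
# Histogram of log10-transformed population values
hist(log10(pop_new_filled$Population))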
Now, we can proceed to extract particular observations from the data frame. To accomplish this, we will employ the filter() function. Filter allows us to selectively extract rows from a data frame based on specific conditions or criteria.
The next code filters Population > 2000 and then arranges the remaining rows in descending order of Population:
# Filter and arrange population
pop_2000 <- pop_new_filled %>%
  filter(Population > 2000) %>%
  arrange(desc(Population))

slice(pop_2000, 1:5) # slice() is used to select specific rows from a data frame, rows 1 to 5 in this case
We can also use the filter() function to extract cities with a population exceeding 15,000. It's worth mentioning that in Estonia, cities are typically identified by names ending in linn.
# Filter and sort data to create a new data frame called "cities15000"
cities15000 <- pop_new_filled %>%
  filter(str_detect(Settlements, "linn$")) %>% # the "$" specifies that "linn" should occur at the end of the "Settlements" strings
  filter(Population > 15000) %>%
  arrange(Settlements)

# Display the "cities15000" data frame
cities15000
Let's proceed with narrowing down our data by specifically selecting three highly populated cities in Estonia (namely Tartu, Narva, and Pärnu):
# Filter the data frame to include only specific cities
top3_cities_1 <- pop_new_filled %>%
  filter(Settlements == "Tartu linn" |
         Settlements == "Narva linn" |
         Settlements == "Pärnu linn")

# Alternatively, use the "%in%" operator
top3_cities_2 <- pop_new_filled %>%
  filter(Settlements %in% c("Tartu linn", "Narva linn", "Pärnu linn"))

top3_cities_1
The %in% operator in R is used to check if elements from one vector are present in another vector. It returns a logical vector indicating whether each element in the first vector is found in the second vector. In our case, %in% is used to filter rows in a data frame where the Settlements column matches any of the values in the specified vector. It filters rows where the Settlements column is equal to "Tartu linn", "Narva linn", or "Pärnu linn".
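A tiny standalone illustration of the logical vector that %in% returns (toy vectors, unrelated to our data):
# Check which elements of the first vector appear in the second
c("Tartu", "Tallinn", "Narva") %in% c("Tartu linn", "Narva", "Pärnu")
[1] FALSE FALSE  TRUE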
We can now generate a map that illustrates the population distribution across different settlements in Estonia. First, we need to convert the pop_new_filled object from a data frame to a spatial (sf) format using the st_as_sf() function from the sf package.
# Convert the data frame "pop_new_filled" to sf (spatial) format
pop_new_filled_sf <- st_as_sf(pop_new_filled, crs = 3301)

# Display a summary of the new object
glimpse(pop_new_filled_sf)
Rows: 4,713
Columns: 4
$ Code <chr> "9599", "1885", "3050", "2654", "9010", "1846", "1081", "4…
$ Settlements <chr> "Võuküla", "Holvandi küla", "Kiisa küla", "Kanassaare küla…
$ Population <dbl> 60, 46, 74, 54, 65, 501, 100, 146, 239, 51, 65, 55, 55, 51…
$ geom <MULTIPOLYGON [m]> MULTIPOLYGON (((693596.1 64..., MULTIPOLYGON …
Next, we can create a spatial visualization using the ggplot2 package, part of the tidyverse collection. ggplot2, short for Grammar of Graphics plot, is a widely used data visualization package offering a versatile framework to generate a diverse array of both static and interactive plots. It's based on the Grammar of Graphics, which is a comprehensive framework for visualizing data by dividing graphs into meaningful components like scales and layers. More documentation on visualization of sf objects using the ggplot2 package can be found here. Be aware that the package is ggplot2, and the function is ggplot().
# Create a ggplot object and add a "pop_new_filled_sf" layer with population data
my_plot <- ggplot() +
  geom_sf(data = pop_new_filled_sf,
          aes(fill = cut(Population,
                         breaks = c(0, 50, 100, 150, 200, 120000),
                         labels = c("0-50", "51-100", "101-150", "151-200", "200+")))) +
  # Customize the legend and add titles
  labs(fill = "Population per settlement",
       title = "Population Distribution in Estonia",
       subtitle = "Data source: Statistics of Estonia") +
  # Apply a color scale using a palette from RColorBrewer
  scale_fill_brewer(palette = "YlOrRd") + # You can change "YlOrRd" to any other palette name
  # Improve the overall theme for better visualization
  theme_grey() +
  theme(legend.position = "right") # "right" is also the default position
# For more RColorBrewer options, please visit https://r-graph-gallery.com/38-rcolorbrewers-palettes.html
# Display the plot
my_plot
As a final step, let's save our joined population data (pop_new_filled_sf) and the map we've just created (my_plot). It's a common best practice to save the final outputs for potential future use. We can use the st_write() function from the sf package to save the sf files, and the ggsave() function from the ggplot2 package to save the ggplot maps.
# Save the sf data
st_write(pop_new_filled_sf, "population.gpkg", append = TRUE)
# Save the ggplot figure to a png file with custom width and height
ggsave("my_plot.png", plot = my_plot, width = 6, height = 4)
# Please note that this function saves the file directly to the working directory by default. If you wish to save it in a different directory, you can specify the desired path.
How to get help in R?
When working with R, there are several ways to get help and find solutions to problems. Here are some common ways:
Built-in Documentation: R has extensive built-in documentation that you can access within the R environment. The help() function and the ? help operator in R provide access to the documentation pages for R functions, data sets, and other objects, both for packages in the standard R distribution and for contributed packages. To access documentation for the standard lm (linear model) function, for example, enter the command help(lm) or help("lm"), or ?lm or ??lm.
R Documentation: A comprehensive collection of R packages and their documentation.
Stack Overflow: A question-and-answer website where you can search for R-related questions or ask your own. Before posting your question, make sure that it hasn't been answered before and is well crafted; chances are you will get an answer in less than 5 minutes. Remember to follow their guidelines on how to ask a good question.
RStudio Community: A forum for R and RStudio users to ask questions, share knowledge, and discuss R-related topics.
R Bloggers: A blog aggregator with articles, tutorials, and examples related to R programming.
R Email Lists: The R Project maintains a number of subscription-based email lists for posing and answering questions about R. Before posing a question on lists, please make sure that you read the posting guide.
- General R-help email list
- R-devel list for R code development
- R-package-devel list for developers of CRAN packages;
- R-announce email list and a variety of more specialized lists.
RPubs: A web platform provided by RStudio that allows R users to publish and share their R Markdown documents as interactive web pages.
Social Media and R Forums: Join R-related groups or communities on social media platforms like Twitter, LinkedIn, or Facebook. You can connect with other R users, ask questions, and share insights.
Reference Materials
This course does not require any mandatory textbooks. It relies on a diverse set of sources to deliver course information, and the primary references are provided below:
Geocomputation with R by Lovelace Robin, Jakub Nowosad, and Jannes Muenchow
Spatial Statistics for Data Science: Theory and Practice with R by Moraga, Paula
Geographic Data Science with R: Visualizing and Analyzing Environmental Change by Michael C. Wimberly
Spatial Data Science with applications in R by Edzer Pebesma, Roger Bivand
Spatial Analysis with R by Chia Jung, Yeh
Modern Data Science with R by Benjamin S. Baumer, Daniel T. Kaplan, and Nicholas J. Horton
R for Data Science by Hadley Wickham, Mine Cetinkaya-Rundel, and Garrett Grolemund