import numpy as np
import pandas as pd
import geopandas as gpd
# import pysal -> nowadays you typically import the subpackages directly: mapclassify, pointpats, esda, ...
import libpysal as lp
import esda
import os
import seaborn as sns
import contextily
import matplotlib.pyplot as plt
from shapely.geometry import Point
from shapely.geometry import Polygon
from sklearn.cluster import DBSCAN
import cartopy.crs as ccrs
%matplotlib inline
Point pattern analysis
Point pattern analysis is a set of statistical techniques used to study the spatial distribution of point events. It aims to determine if points are clustered, dispersed, or randomly distributed, and to identify the factors influencing their spatial arrangement. It’s used in diverse fields like ecology, epidemiology, criminology, and geography to analyze phenomena like disease outbreaks, crime hotspots, or species distributions.
Three main methods:
- Centrography
- Density based metrics
- Distance based metrics
We will use Tartu bike-share GPS data for the analysis. The bike data originates from the Tartu City Government; the landscape fire data used later comes from the Estonian Rescue Board.
Download a Zipfile here.
points = pd.read_csv('rattaringluse_gps_valjavote_final.csv', encoding='utf8')
points.head(3)
| | cyclenumber | time_stamp | second | unix_epoch | X | Y | date | time |
|---|---|---|---|---|---|---|---|---|
| 0 | 2045 | 19/07/2019 07:02 | 0 | 1563519770 | 26.720922 | 58.380096 | 19/07/2019 | 07:02:50 |
| 1 | 2045 | 19/07/2019 07:41 | 0 | 1563522110 | 26.720424 | 58.380070 | 19/07/2019 | 07:41:50 |
| 2 | 2045 | 19/07/2019 08:11 | 0 | 1563523880 | 26.720045 | 58.380042 | 19/07/2019 | 08:11:20 |
len(points)
5379
Let's check how many different bikes appear in the dataset.
# Get unique cycle numbers
unique_cycles = points['cyclenumber'].unique().tolist()

# Get the count of unique cycle numbers
num_unique_cycles = len(unique_cycles)

# Print the results
print(f"Unique cycle numbers: {unique_cycles}")
print(f"Number of unique cycle numbers: {num_unique_cycles}")
Unique cycle numbers: [2045, 2056, 2058, 2064, 2068, 2070, 2090, 2093, 2098, 2103, 2104, 2119, 2124, 2126, 2128, 2145, 2154, 2166, 2169, 2171, 2172, 2174, 2178, 2185, 2199, 2207, 2209, 2211, 2220, 2227, 2229, 2232, 2233, 2239, 2246, 2259, 2260, 2267, 2275, 2306, 2321, 2324, 2329, 2344, 2352, 2356, 2365, 2380, 2394, 2396, 2417, 2423, 2428, 2432, 2433, 2446, 2448, 2454, 2458, 2468, 2478, 2485, 2494, 2495, 2504, 2516, 2522, 2524, 2526, 2528, 2545, 2564, 2566, 2580, 2586, 2602, 2608, 2609, 2613, 2616, 2620, 2625, 2641, 2645, 2646, 2655, 2662, 2665, 2672, 2684, 2687, 2711, 2735, 2740, 2742, 2749, 2752, 2784]
Number of unique cycle numbers: 98
Now, we can check the time period for the dataset. The time is in column unix_epoch. Unix time is a date and time representation widely used in computing. It measures time by the number of non-leap seconds that have elapsed since 00:00:00 UTC on 1 January 1970 (Wiki).
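If we want human-readable timestamps, the epochs can be converted with pandas. This is a minimal sketch that adds a helper column named datetime (the column name is our own choice, not part of the original data) and is not required for the rest of the analysis:

# Convert Unix epochs (seconds since 1970-01-01 UTC) to pandas timestamps
points['datetime'] = pd.to_datetime(points['unix_epoch'], unit='s')
print(points['datetime'].min(), points['datetime'].max())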
# Calculate duration in hours
duration = (points['unix_epoch'].max() - points['unix_epoch'].min()) / (60 * 60)

# Print the result
print(f"Duration (hours): {duration}")
Duration (hours): 1.9944444444444445
# Make geometry column from X and Y columns
def create_geometry_from_xy(df, x_col='x', y_col='y', crs="EPSG:4326"):
    try:
        geometry = [Point(xy) for xy in zip(df[x_col], df[y_col])]
        gdf = gpd.GeoDataFrame(df, geometry=geometry, crs=crs)
        return gdf
    except KeyError as e:
        print(f"Error: Column '{e.args[0]}' not found in DataFrame.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

bike_points = create_geometry_from_xy(points, x_col='X', y_col='Y', crs="EPSG:4326")
bike_points.head(3)
| | cyclenumber | time_stamp | second | unix_epoch | X | Y | date | time | geometry |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2045 | 19/07/2019 07:02 | 0 | 1563519770 | 26.720922 | 58.380096 | 19/07/2019 | 07:02:50 | POINT (26.72092 58.3801) |
| 1 | 2045 | 19/07/2019 07:41 | 0 | 1563522110 | 26.720424 | 58.380070 | 19/07/2019 | 07:41:50 | POINT (26.72042 58.38007) |
| 2 | 2045 | 19/07/2019 08:11 | 0 | 1563523880 | 26.720045 | 58.380042 | 19/07/2019 | 08:11:20 | POINT (26.72005 58.38004) |
bike_points.plot(marker='o', color="red", markersize=1.0)
# Let's plot the GPS points of each bike with a different color
unique_cycles = bike_points['cyclenumber'].unique()
norm = plt.Normalize(vmin=bike_points['cyclenumber'].min(), vmax=bike_points['cyclenumber'].max())

fig, ax = plt.subplots(1, 1)

for cycle in unique_cycles:
    cycle_points = bike_points[bike_points['cyclenumber'] == cycle]
    color = plt.cm.tab20(norm(cycle))
    cycle_points.plot(ax=ax, marker='o', color=color, markersize=1.0, label=f'Cycle {cycle}')

#ax.legend(title='cyclenumber')
plt.show()
Centrography
Centrography is a set of spatial statistical techniques used to describe the central tendency and dispersion of point patterns or geographic distributions. It provides quantitative measures of central tendency and dispersion:
- Mean center: The average location of the points.
- Median center: The point that minimizes the total distance to all other points.
- Standard deviational ellipse: A representation of the spatial spread and orientation of the points.
We will first calculate mean and median center of the gps points.
from pointpats import centrography
from matplotlib.patches import Ellipse

# calculate spatial mean and median of the points
mean_center = centrography.mean_center(bike_points[["X", "Y"]])
med_center = centrography.euclidean_median(bike_points[["X", "Y"]])

print("Mean center is", mean_center)
print("Median center is", med_center)
Mean center is [26.72938143 58.37326066]
Median center is [26.72919961 58.37584394]
In point pattern analysis, the standard deviational ellipse (SDE) provides a visual and statistical representation of the spatial dispersion and orientation of a set of points. The size of the ellipse indicates the overall spread of the point pattern: a larger ellipse signifies greater dispersion, while a smaller ellipse suggests a more concentrated pattern. The angle of the ellipse’s major axis reveals the primary direction or orientation of the point distribution; it shows the trend or elongation of the pattern.
# calculate standard ellipse
major, minor, rotation = centrography.ellipse(bike_points[["X", "Y"]])
# Create the base for the figure - subplots and size
fig, ax = plt.subplots(1, figsize=(10, 10))
ax.set_aspect('equal')  # this is important to keep the proportions of the projected map correct!

# Plot bike points
ax.scatter(bike_points["X"], bike_points["Y"], s=0.2)
ax.scatter(*mean_center, color="red", marker="D", label="Mean center", zorder=2)
ax.scatter(*med_center, marker="*", color="purple", label="Median center", zorder=2)

# Construct the standard ellipse using matplotlib
ellipse = Ellipse(
    xy=mean_center,              # center the ellipse on the mean center
    width=major * 2,             # centrography.ellipse gives only half of the axis, so we multiply by 2
    height=minor * 2,
    angle=np.rad2deg(rotation),  # centrography gives the angle in radians; convert radians to degrees
    facecolor="darkred",
    edgecolor="darkred",
    linewidth=2,
    linestyle="-.",
    alpha=0.3,
    label="Standard ellipse",
    zorder=1,
)
ax.add_patch(ellipse)

ax.legend()

# Add basemap
contextily.add_basemap(
    ax,
    crs="EPSG:4326",
    source=contextily.providers.CartoDB.Voyager
)
plt.show()
Convex hull is the smallest convex polygon that encloses all the points in a dataset. It effectively creates a “boundary” around the point distribution, showing the outermost extent of the pattern. It’s used to define and visualise the spatial extent but also provides a way to identify outliers. Points on the hull are the most extreme, potentially highlighting outliers.
Let’s compare the convex hulls of two different bikes. First we have to extract two bikes from the initial dataframe. Let’s choose bikes number 2735 and 2119.
# Extract bike number 2735 data
bike_1 = bike_points[bike_points['cyclenumber'] == 2735]
coords_1 = bike_1[["X", "Y"]].values

# Extract bike number 2119 data
bike_2 = bike_points[bike_points['cyclenumber'] == 2119]
coords_2 = bike_2[["X", "Y"]].values
Next we need to calculate the coordinates of the convex hulls based on the bikes’ coordinates. For that we can use the centrography.hull method.
# calculate convex hull coordinates for bike 2735
convex_hull_coords1 = centrography.hull(coords_1)

# calculate convex hull coordinates for bike 2119
convex_hull_coords2 = centrography.hull(coords_2)
An even more accurate way to represent the shape of a point cloud is the alpha shape. An alpha shape provides a way to create more detailed and potentially concave boundaries around a point cloud: an alpha shape can have concave parts, while a convex hull can’t. In fact, every convex hull is an alpha shape, but not every alpha shape is a convex hull.
# calculate alpha shapes for bikes 2735 and 2119
alpha_shape1 = lp.cg.alpha_shape_auto(coords_1)
alpha_shape2 = lp.cg.alpha_shape_auto(coords_2)
Let’s add the alpha shapes and convex hulls together with the bikes’ GPS points to the map.
fig, ax = plt.subplots(1, figsize=(10, 10))
ax.set_aspect('equal')

# Add bike 2735 GPS points
ax.scatter(*coords_1.T, color="blue", marker=".", label="Bike 2735")

# Add bike 2119 GPS points
ax.scatter(*coords_2.T, color="red", marker=".", label="Bike 2119")

# Add bike 2735 alpha shape
gpd.GeoSeries([alpha_shape1]).plot(
    ax=ax,
    edgecolor="blue",
    facecolor="blue",
    alpha=0.1,
    label="Alpha shape of bike 2735"
)

# Add bike 2735 convex hull
ax.add_patch(
    plt.Polygon(
        convex_hull_coords1,
        closed=True,
        edgecolor="blue",
        facecolor="none",
        linestyle="--",
        linewidth=1,
        label="Convex Hull"
    )
)

# Add bike 2119 alpha shape
gpd.GeoSeries([alpha_shape2]).plot(
    ax=ax,
    edgecolor="red",
    facecolor="red",
    alpha=0.1,
    label="Alpha shape of bike 2119",
)

# Add bike 2119 convex hull
ax.add_patch(
    plt.Polygon(
        convex_hull_coords2,
        closed=True,
        edgecolor="red",
        facecolor="none",
        linestyle="--",
        linewidth=1,
        label="Convex Hull2"
    )
)

# Add basemap
contextily.add_basemap(
    ax,
    source=contextily.providers.CartoDB.Voyager,
    crs="EPSG:4326"
)
plt.legend()
/tmp/ipykernel_426/3967103221.py:65: UserWarning: Legend does not support handles for PatchCollection instances.
See: https://matplotlib.org/stable/tutorials/intermediate/legend_guide.html#implementing-a-custom-legend-handler
plt.legend()
Visually, we can see that these two convex hulls don’t overlap. To be sure, let’s check with shapely’s intersects method.
# make shapely polygons from the convex hull coordinates
p1 = Polygon(convex_hull_coords1)
p2 = Polygon(convex_hull_coords2)

# test intersection
print(p1.intersects(p2))
False
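If the hulls did overlap, the shared region could also be quantified with shapely. A minimal sketch: for these two disjoint hulls the intersection is an empty geometry, so its area is 0.

# Intersection of two disjoint polygons is an empty geometry with area 0
overlap = p1.intersection(p2)
print(overlap.area)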
Density based metrics
Density based techniques characterize the pattern in terms of its distribution in the study area. The two most commonly used density-based measures are quadrat density and kernel density.
Quadrat density
In quadrat density analysis, the study area is divided into smaller sub-regions (i.e., quadrats), and the point density is computed for each sub-region by dividing the number of points in each quadrat by the quadrat’s area. Quadrats can take on many different shapes, such as hexagons and triangles. Let’s create a hexagon binning for the GPS points.
fig, ax = plt.subplots(1, figsize=(12,10))
ax.set_aspect('equal')

X = bike_points['X']
Y = bike_points['Y']

hex = ax.hexbin(X,
                Y,
                gridsize=30,  # sets the size of the hexagons. Try to change it
                cmap='plasma',
                alpha=0.5,
                linewidths=0)

plt.colorbar(hex, label='Count of points')
plt.title('Bike gps point density in hexagons')
plt.xlabel('Longitude')
plt.ylabel('Latitude')

# Add basemap
contextily.add_basemap(
    ax,
    source=contextily.providers.OpenStreetMap.Mapnik,
    crs="EPSG:4326"
)
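The hexbin above shows raw counts per hexagon. To get an actual quadrat density (points per unit area), we can also bin the points into a simple rectangular grid and divide the counts by the cell area. A minimal numpy sketch; the 20 × 20 grid size is an arbitrary choice of ours:

# Count points in a 20 x 20 grid of rectangular quadrats
counts, x_edges, y_edges = np.histogram2d(bike_points['X'], bike_points['Y'], bins=20)

# Cell area in degrees^2 (the data is still in EPSG:4326)
cell_area = (x_edges[1] - x_edges[0]) * (y_edges[1] - y_edges[0])

# Density = count per quadrat divided by quadrat area
density = counts / cell_area
print(density.max())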
Kernel density
Kernel density uses a moving window, which is defined by a kernel. The kernel density approach generates a grid of density values whose cell size is smaller than the kernel window. Each cell is assigned the density value computed for the kernel window centered on that cell. Instead of binning data into discrete bins like hexagons, KDE (Kernel Density Estimation) plots create a smooth, continuous curve representing the data’s estimated probability density. Common kernels include the Gaussian (normal) kernel. A kernel is placed over each data point, and these kernels are summed to create the overall density estimate. KDE plots provide a smoother representation of the data’s distribution compared to quadrats, which can be affected by the choice of bin size. This smoothness can reveal underlying patterns that might be obscured in quadrats.
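To see the “sum of kernels” idea in one dimension, here is a small illustration, independent of the bike data and assuming scipy is available (scipy.stats.gaussian_kde places a Gaussian kernel on every sample and sums them into a smooth density):

from scipy.stats import gaussian_kde

# A few made-up 1-D sample values
sample = np.array([1.0, 1.2, 1.3, 3.5, 3.6, 7.0])

# Fit a Gaussian KDE and evaluate the smooth density on a grid
kde = gaussian_kde(sample)
grid = np.linspace(0, 8, 100)
plt.plot(grid, kde(grid))
plt.show()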
Let’s create kernel density estimation for our bike points. For this, we will use seaborn.kdeplot.
fig, ax = plt.subplots(1, figsize=(12,10))
ax.set_aspect('equal')

sns.kdeplot(
    x=bike_points['X'],
    y=bike_points['Y'],
    fill=True,
    cmap='viridis',
    levels=20,  # Number of contour levels or values to draw contours at. Try to change this value.
    alpha=0.7,
    ax=ax
)
plt.title('Kernel density of bike GPS points')
plt.xlabel('Longitude')
plt.ylabel('Latitude')

# Add basemap
contextily.add_basemap(
    ax,
    source=contextily.providers.CartoDB.Voyager,
    crs="EPSG:4326"
)

# Add an additional basemap consisting only of labels. As the KDE covers some relevant
# information like labels, we can add another layer consisting only of labels.
contextily.add_basemap(
    ax,
    source=contextily.providers.CartoDB.PositronOnlyLabels,
    crs="EPSG:4326"
)
DBSCAN
DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a clustering algorithm that groups points that are closely packed together (points with many nearby neighbours), marking as outliers points that lie alone in low-density regions (whose nearest neighbours are too far away). This algorithm is good for data which contains clusters of similar density.
The two most essential parameters in DBSCAN are eps and min_samples.

eps (epsilon) determines how close points need to be to be considered “neighbours”. When DBSCAN examines a point, it checks all other points within a distance of eps from that point. eps essentially controls the “reach” or “spread” of clusters. A small eps value will lead to fewer points being considered neighbours, smaller and denser clusters, and more points being classified as noise (outliers).

min_samples specifies the minimum number of points that must be within a point’s eps neighbourhood for that point to be considered a core point. It sets a density threshold: a point must have at least min_samples neighbours within the eps radius to be considered part of a dense region. A small min_samples value will lead to more points being classified as core points and less dense clusters being formed.

DBSCAN identifies:
1) Core points - a point is classified as a “core point” if it has at least min_samples points within its eps neighbourhood. Core points form the core of the clusters.
2) Border points (non-core points) - data points that are within the eps neighbourhood of a core point but do not have min_samples points within their own eps neighbourhood. Border points are considered part of a cluster but are not as central as core points.
3) Noise points - data points that are neither core nor border points.

Points within each other’s eps neighbourhoods are considered part of the same cluster.
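To make the core/noise distinction concrete, here is a tiny toy example, independent of the bike data (a minimal sketch with made-up coordinates):

# Three tightly packed points and one isolated point
toy = np.array([[0.0, 0.0], [0.0, 0.5], [0.5, 0.0], [10.0, 10.0]])

# With eps=1 and min_samples=3 the three close points form one cluster,
# while the isolated point gets the noise label -1
toy_db = DBSCAN(eps=1.0, min_samples=3).fit(toy)
print(toy_db.labels_)               # expected: [0 0 0 -1]
print(toy_db.core_sample_indices_)  # indices of the core points, expected: [0 1 2]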
#data_points = bike_points.to_crs(epsg=3301)

# find clusters for the bike points
db = DBSCAN(eps=0.001, min_samples=30).fit(bike_points[["X", "Y"]])  # Try to change eps and min_samples values
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
Estimated number of clusters: 19
Estimated number of noise points: 2896
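To see how many points each cluster contains, we can count the labels (a minimal sketch; label -1 is the noise “cluster”):

# Count points per cluster label (-1 = noise)
cluster_ids, cluster_sizes = np.unique(labels, return_counts=True)
print(dict(zip(cluster_ids, cluster_sizes)))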
bike_points
| | cyclenumber | time_stamp | second | unix_epoch | X | Y | date | time | geometry |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2045 | 19/07/2019 07:02 | 0 | 1563519770 | 26.720922 | 58.380096 | 19/07/2019 | 07:02:50 | POINT (26.72092 58.3801) |
| 1 | 2045 | 19/07/2019 07:41 | 0 | 1563522110 | 26.720424 | 58.380070 | 19/07/2019 | 07:41:50 | POINT (26.72042 58.38007) |
| 2 | 2045 | 19/07/2019 08:11 | 0 | 1563523880 | 26.720045 | 58.380042 | 19/07/2019 | 08:11:20 | POINT (26.72005 58.38004) |
| 3 | 2045 | 19/07/2019 08:11 | 0 | 1563523890 | 26.719941 | 58.380027 | 19/07/2019 | 08:11:30 | POINT (26.71994 58.38003) |
| 4 | 2045 | 19/07/2019 08:11 | 0 | 1563523900 | 26.719004 | 58.379858 | 19/07/2019 | 08:11:40 | POINT (26.719 58.37986) |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5374 | 2784 | 19/07/2019 08:47 | 0 | 1563526050 | 26.721271 | 58.375757 | 19/07/2019 | 08:47:30 | POINT (26.72127 58.37576) |
| 5375 | 2784 | 19/07/2019 08:47 | 0 | 1563526060 | 26.721718 | 58.376008 | 19/07/2019 | 08:47:40 | POINT (26.72172 58.37601) |
| 5376 | 2784 | 19/07/2019 08:47 | 0 | 1563526070 | 26.722585 | 58.376385 | 19/07/2019 | 08:47:50 | POINT (26.72259 58.37638) |
| 5377 | 2784 | 19/07/2019 08:48 | 0 | 1563526090 | 26.724580 | 58.377187 | 19/07/2019 | 08:48:10 | POINT (26.72458 58.37719) |
| 5378 | 2784 | 19/07/2019 08:48 | 0 | 1563526100 | 26.725410 | 58.377520 | 19/07/2019 | 08:48:20 | POINT (26.72541 58.37752) |

5379 rows × 9 columns
We can see that there are 19 clusters and a lot of noise points. Let’s plot them.
fig, ax = plt.subplots(1, figsize=(12,7))
ax.set_aspect('equal')

unique_labels = set(labels)
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

colors = [plt.cm.Paired(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = labels == k

    # plot the core points of this cluster with large markers
    xy = bike_points[class_member_mask & core_samples_mask]
    ax.plot(
        xy.loc[:, "X"],
        xy.loc[:, "Y"],
        "o",
        markerfacecolor=tuple(col),
        markeredgecolor="black",
        markersize=7,
        markeredgewidth=0.5,
        zorder=2
    )

    # plot the non-core points (noise when k == -1) with small markers
    xy = bike_points[class_member_mask & ~core_samples_mask]
    ax.plot(
        xy.loc[:, "X"],
        xy.loc[:, "Y"],
        "o",
        markerfacecolor=tuple(col),
        markeredgecolor="black",
        markersize=2,
        markeredgewidth=0.5,
        zorder=1
    )

# Add basemap
contextily.add_basemap(
    ax,
    source=contextily.providers.CartoDB.Voyager,
    crs="EPSG:4326"
)
plt.title(f"Estimated number of bike GPS point clusters: {n_clusters_}")
Text(0.5, 1.0, 'Estimated number of bike GPS point clusters: 19')
Core samples (large dots) and non-core samples (small dots) are color-coded according to the assigned cluster. Samples tagged as noise are represented in black.
We can see that the bike data is not the easiest for finding clusters: the points are strung out along the bike trails, so no very clear clustering exists. Let’s try another dataset for DBSCAN clustering. We will use OpenStreetMap data to map the shops of Tartu and see if there is any clustering there.
import osmnx as ox
# Get place boundary related to the place name as a geodataframe
place_name = 'Tartu'
area = ox.geocode_to_gdf(place_name)

area.plot()
# Extract shops from the OSM dataset
tags = {'shop': True}

shops = ox.features_from_place('Tartu', tags)
shops.plot(color='orange', markersize=1)
shops.head(5)
geometry | addr:city | addr:country | addr:housenumber | addr:postcode | addr:street | check_date | name | opening_hours | ... | fuel:lpg | fuel:taxfree_diesel | fuel:adblue | shower | old_addr:street | communication:mobile_phone | service | layer | air_conditioning | toilets:wheelchair | |||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
element | id | |||||||||||||||||||||
node | 266921299 | POINT (26.67666 58.35805) | Tartu | EE | 75 | 50501 | Ringtee | 2024-08-11 | info@lounakeskus.com | Lõunakeskus | Mo-Su 10:00-21:00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
322906403 | POINT (26.73024 58.38247) | Tartu | NaN | 20 | 50603 | Raatuse | NaN | NaN | Raatuse Selver ABC | Mo-Su 09:00-22:00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
331179287 | POINT (26.71956 58.36477) | Tartu | EE | 55f | NaN | Võru | 2024-07-02 | NaN | Maxima XX | Mo-Su 08:00-22:00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
354470712 | POINT (26.71226 58.37624) | Tartu | NaN | 20 | 50409 | J. Kuperjanovi | 2022-12-22 | NaN | Alko Feenoks | Mo-Su 10:00-22:00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
361134472 | POINT (26.73618 58.37472) | Tartu | NaN | 14 | NaN | Turu | 2023-01-04 | NaN | Maxima XX | Mo-Su 08:00-22:00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 147 columns
This time we will transform the WGS84 data to the Estonian CRS (EPSG:3301).
# reproject the shops data to the Estonian CRS
proj = shops.to_crs(epsg=3301)
proj.head(3)
geometry | addr:city | addr:country | addr:housenumber | addr:postcode | addr:street | check_date | name | opening_hours | ... | fuel:lpg | fuel:taxfree_diesel | fuel:adblue | shower | old_addr:street | communication:mobile_phone | service | layer | air_conditioning | toilets:wheelchair | |||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
element | id | |||||||||||||||||||||
node | 266921299 | POINT (656645.839 6471739.365) | Tartu | EE | 75 | 50501 | Ringtee | 2024-08-11 | info@lounakeskus.com | Lõunakeskus | Mo-Su 10:00-21:00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
322906403 | POINT (659668.71 6474583.027) | Tartu | NaN | 20 | 50603 | Raatuse | NaN | NaN | Raatuse Selver ABC | Mo-Su 09:00-22:00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
331179287 | POINT (659124.663 6472588.706) | Tartu | EE | 55f | NaN | Võru | 2024-07-02 | NaN | Maxima XX | Mo-Su 08:00-22:00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 rows × 147 columns
# keep only point geometries and add x and y coordinate columns
shop_points = proj[proj['geometry'].apply(lambda geom: isinstance(geom, Point))].copy()
shop_points['x'] = shop_points['geometry'].x
shop_points['y'] = shop_points['geometry'].y
shop_points.head(3)
geometry | addr:city | addr:country | addr:housenumber | addr:postcode | addr:street | check_date | name | opening_hours | ... | fuel:adblue | shower | old_addr:street | communication:mobile_phone | service | layer | air_conditioning | toilets:wheelchair | x | y | |||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
element | id | |||||||||||||||||||||
node | 266921299 | POINT (656645.839 6471739.365) | Tartu | EE | 75 | 50501 | Ringtee | 2024-08-11 | info@lounakeskus.com | Lõunakeskus | Mo-Su 10:00-21:00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 656645.838896 | 6.471739e+06 |
322906403 | POINT (659668.71 6474583.027) | Tartu | NaN | 20 | 50603 | Raatuse | NaN | NaN | Raatuse Selver ABC | Mo-Su 09:00-22:00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 659668.709600 | 6.474583e+06 | |
331179287 | POINT (659124.663 6472588.706) | Tartu | EE | 55f | NaN | Võru | 2024-07-02 | NaN | Maxima XX | Mo-Su 08:00-22:00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 659124.662764 | 6.472589e+06 |
3 rows × 149 columns
As the coordinate units changed from degrees (WGS84) to metres (EPSG:3301), the eps value also needs to be increased accordingly.
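As a rough sanity check of the scale change (an approximation only): one degree of latitude is about 111 km, and one degree of longitude at Tartu’s latitude (~58.4°N) is about 111 km × cos(58.4°) ≈ 58 km, so the earlier eps=0.001 corresponded to very roughly 60 to 110 m, whereas in EPSG:3301 eps is given directly in metres.

# Approximate metric size of eps = 0.001 degrees at Tartu's latitude
deg = 0.001
meters_per_deg_lat = 111_320                              # ~111 km per degree of latitude
meters_per_deg_lon = 111_320 * np.cos(np.radians(58.38))  # shrinks with latitude
print(deg * meters_per_deg_lat, deg * meters_per_deg_lon) # ~111 m and ~58 m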
db = DBSCAN(eps=500, min_samples=10).fit(shop_points[["x", "y"]])  # Try to change eps and min_samples values
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
Estimated number of clusters: 6
Estimated number of noise points: 47
fig, ax = plt.subplots(1, figsize=(12,7))
ax.set_aspect('equal')

unique_labels = set(labels)
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = labels == k

    # plot the core points of this cluster with large markers
    xy = shop_points[class_member_mask & core_samples_mask]
    ax.plot(
        xy.loc[:, "x"],
        xy.loc[:, "y"],
        "o",
        markerfacecolor=tuple(col),
        markeredgecolor="black",
        markersize=10,
        markeredgewidth=0.5
    )

    # plot the non-core points (noise when k == -1) with small markers
    xy = shop_points[class_member_mask & ~core_samples_mask]
    ax.plot(
        xy.loc[:, "x"],
        xy.loc[:, "y"],
        "o",
        markerfacecolor=tuple(col),
        markeredgecolor="black",
        markersize=4,
        markeredgewidth=0.5
    )

# Add basemap
contextily.add_basemap(
    ax,
    source=contextily.providers.CartoDB.Voyager,
    crs="EPSG:3301"
)
plt.title(f"Estimated number of clusters: {n_clusters_}")
Text(0.5, 1.0, 'Estimated number of clusters: 6')
Distance based metrics
In the distance based methods, the interest lies in how the points are distributed relative to one another, as opposed to how the points are distributed relative to the study extent. We will look into the following methods: nearest neighbour distance (NND), the G function, and hotspot analysis.
For distance based metrics, we will use the landscape fires dataset.
from pointpats import PointPattern
from pointpats import distance_statistics

fires = gpd.read_file("est-wild_fires.shp")
fires.head(3)
| | sundmuse_n | sundmuse_k | tulekahju_ | sundmuse_a | mis_poles | wgs_latitu | wgs_longit | maakond | kov | korgem_ast | ... | likvideeri | viimase_re | sundmuse_1 | valjasoidu | valjasoitn | annulleeri | kohal_ress | x | y | geometry |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2166474 | 2021-04-25 | Metsa-ja maastikutulekahju | Kulu | Kogutud kuhilad väljaspool rajatisi | 58.946 | 23.538 | Lääne maakond | Haapsalu linn | PAASTE_1A | ... | None | 2021-04-25 13:06:47 | 00:13:42 | 1 | 1.0 | NaN | 1.0 | 473406.543225 | 6.534192e+06 | POINT (473406.543 6534192.248) |
| 1 | 2166232 | 2021-04-25 | Metsa-ja maastikutulekahju | Mets/maastik | None | 59.395 | 27.281 | Ida-Viru maakond | Kohtla-Järve linn | PAASTE_1 | ... | 2021-04-25 09:20:13 | 2021-04-25 09:30:26 | 02:09:11 | 2 | 2.0 | NaN | 2.0 | 686340.270306 | 6.588675e+06 | POINT (686340.27 6588675.282) |
| 2 | 2165241 | 2021-04-23 | Metsa-ja maastikutulekahju | Mets/maastik | None | 59.318 | 24.229 | Harju maakond | Lääne-Harju vald | PAASTE_2 | ... | None | 2021-04-23 22:31:33 | 00:33:50 | 8 | 6.0 | 6.0 | 2.0 | 513040.214259 | 6.575561e+06 | POINT (513040.214 6575561.392) |

3 rows × 25 columns
Let’s plot the landscape fires.
fig, ax = plt.subplots(1, figsize=(12,7))
ax.set_aspect('equal')

#ax.plot(color='orange', markersize=1)
ax.scatter(fires["x"], fires["y"], s=1, color="red")

# Add basemap
contextily.add_basemap(
    ax,
    source=contextily.providers.CartoDB.Voyager,
    crs="EPSG:3301"
)
plt.title("Landscape fires")
Text(0.5, 1.0, 'Landscape fires')
Nearest-Neighbor Distance (NND) is the distance between a point and its closest neighboring point. NND is also known as the first-order nearest neighbor. In addition, distance can be calculated for the kth nearest neighbor, which is called the kth-order NN or KNN. The mean of NND between all point pairs is used as a global indicator to measure the overall pattern of a point set.
To calculate NND, we first need to wrap the fires’ x and y coordinates from the GeoDataFrame into a PointPattern object that pointpats can use.
pp = PointPattern(fires[["x", "y"]])

# calculate mean nearest neighbour distance of the whole study area
pp.mean_nnd
np.float64(3847.449077094534)
On average, each fire has its nearest neighbour within about 3847 m.
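Beyond the mean, the full distribution of nearest-neighbour distances can be inspected as well. A minimal sketch, assuming the PointPattern object exposes the per-point nearest-neighbour distances as pp.nnd (as in recent pointpats versions):

# Histogram of nearest-neighbour distances between fires
plt.hist(pp.nnd, bins=30)
plt.xlabel("Nearest-neighbour distance (m)")
plt.ylabel("Number of fires")
plt.show()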
G-function
The G-function, also known as the nearest neighbour distance distribution function, is a spatial point pattern analysis tool that describes the distribution of distances from each point in a dataset to its nearest neighbour. It’s used to assess whether a point pattern is clustered, dispersed (regular), or random. The G-function calculates the cumulative distribution of nearest neighbour distances, which tells you the proportion of points that have a nearest neighbour within a certain distance. In other words, G(r) is the probability that a randomly chosen point has its nearest neighbour within a distance r.
By comparing the observed G-function to a theoretical G-function (under the assumption of complete spatial randomness, CSR), you can infer the spatial pattern of your points. Under CSR, the points are assumed to be randomly and independently distributed.
- Clustered: If the observed G-function is above the expected G-function under CSR, it suggests that the points are clustered. This means that, on average, points are closer to their nearest neighbours than expected by chance.
- Dispersed: If the observed G-function is below the expected G-function under CSR, it suggests that the points are dispersed or regularly spaced. This means that, on average, points are farther from their nearest neighbours than expected by chance.
g_test = distance_statistics.g_test(
    fires[["x", "y"]],     # the points' coordinates for which to calculate distances
    keep_simulations=True  # this simulates CSR and keeps the simulations for plotting
)

print(g_test)
GtestResult(support=array([ 0. , 1845.7772965 , 3691.554593 , 5537.3318895 ,
7383.109186 , 9228.88648249, 11074.66377899, 12920.44107549,
14766.21837199, 16611.99566849, 18457.77296499, 20303.55026149,
22149.32755799, 23995.10485449, 25840.88215099, 27686.65944748,
29532.43674398, 31378.21404048, 33223.99133698, 35069.76863348]), statistic=array([0. , 0.50251256, 0.66834171, 0.76633166, 0.82663317,
0.87688442, 0.89949749, 0.92713568, 0.94974874, 0.96733668,
0.9798995 , 0.98743719, 0.98994975, 0.99246231, 0.99748744,
0.99748744, 0.99748744, 0.99748744, 0.99748744, 1. ]), pvalue=array([0.000e+00, 1.000e-04, 1.000e-04, 1.000e-04, 1.000e-04, 1.000e-04,
2.680e-02, 2.831e-01, 1.720e-02, 2.000e-03, 9.000e-04, 9.000e-04,
1.000e-04, 2.000e-04, 4.200e-03, 6.000e-04, 3.000e-04, 1.000e-04,
0.000e+00, 0.000e+00]), simulations=array([[0. , 0.08291457, 0.23869347, ..., 1. , 1. ,
1. ],
[0. , 0.06281407, 0.20351759, ..., 1. , 1. ,
1. ],
[0. , 0.04522613, 0.19346734, ..., 1. , 1. ,
1. ],
...,
[0. , 0.06030151, 0.19346734, ..., 1. , 1. ,
1. ],
[0. , 0.04522613, 0.20854271, ..., 1. , 1. ,
1. ],
[0. , 0.04522613, 0.18844221, ..., 1. , 1. ,
1. ]], shape=(9999, 20)))
fig, ax = plt.subplots(1, figsize=(10,7))

# plot the observed pattern's G function
ax.plot(
    g_test.support,
    g_test.statistic,
    label="observed",
    color="red",
    zorder=3
)

# Plot all the simulations with very fine lines
ax.plot(
    g_test.support,
    g_test.simulations.T,
    color="lightgray",
    alpha=0.01,
    zorder=2
)

# Plot the median of simulations
ax.plot(
    g_test.support,
    np.median(g_test.simulations, axis=0),
    color="black",
    label="median simulation",
    zorder=1
)

# Add labels to axes and figure title
ax.set_xlabel("Distance (m)")
ax.set_ylabel("Proportion of points that have a nearest neighbour \nwithin this distance")
ax.legend()
ax.set_xlim(0, 20000)
ax.set_title("Ripley's $G(d)$ function for landscape fires")

plt.show()
Our G-function analysis shows that the fire points are clustered up to roughly a 12000 m neighbourhood, and beyond that they are rather dispersed. This can also be seen by visual inspection: the fires form clusters around cities, and further away there are large areas with almost no fires or only a few.
Hotspot analysis is a spatial statistical technique used to identify clusters of high or low values within a geographic dataset. The difference with other clustering methods is that it also takes the data attribute value into account, i.e. whether the attribute value is high or low, and clusters high and low values separately. It uses the Getis-Ord Gi* statistic to identify clusters: a z-score is calculated for each feature in the dataset, indicating whether it is part of a statistically significant hotspot or coldspot. Hotspots are areas where high values cluster, and coldspots are areas where low values cluster.
Hotspot analysis needs one attribute value, and we will use ‘valjasoitn’ for that purpose. ‘valjasoitn’ shows how many Rescue Board resources were actually deployed to the event, which shows the magnitude of the event indirectly. The higher the number, the bigger the fire.
import esda

# lp is still libpysal from above
# Create a spatial weights matrix
w = lp.weights.DistanceBand.from_dataframe(fires, threshold=30000)  # 30000 m is an example distance. Adjust as needed.

# Perform Getis-Ord Gi* analysis
gi = esda.G_Local(fires['valjasoitn'], w)

# Add Gi* results to the GeoDataFrame
fires['Gi_zscore'] = gi.z_sim
fires['Gi_pvalue'] = gi.p_sim

# Classify hotspots and coldspots and add them to the dataframe
fires['hotspot'] = [
    'hotspot' if z > 1.96 and p < 0.05
    else 'coldspot' if z < -1.96 and p < 0.05
    else 'not significant'
    for z, p in zip(fires['Gi_zscore'], fires['Gi_pvalue'])
]

fires.head()
/usr/local/lib/python3.13/site-packages/libpysal/weights/util.py:826: UserWarning: The weights matrix is not fully connected:
There are 2 disconnected components.
There is 1 island with id: 68.
w = W(neighbors, weights, ids, **kwargs)
/usr/local/lib/python3.13/site-packages/libpysal/weights/distance.py:844: UserWarning: The weights matrix is not fully connected:
There are 2 disconnected components.
There is 1 island with id: 68.
W.__init__(
/usr/local/lib/python3.13/site-packages/esda/getisord.py:527: RuntimeWarning: invalid value encountered in divide
z_scores = (statistic - expected_value) / np.sqrt(expected_variance)
('WARNING: ', 68, ' is an island (no neighbors)')
/usr/local/lib/python3.13/site-packages/esda/getisord.py:450: RuntimeWarning: invalid value encountered in divide
self.z_sim = (self.Gs - self.EG_sim) / self.seG_sim
| | sundmuse_n | sundmuse_k | tulekahju_ | sundmuse_a | mis_poles | wgs_latitu | wgs_longit | maakond | kov | korgem_ast | ... | valjasoidu | valjasoitn | annulleeri | kohal_ress | x | y | geometry | Gi_zscore | Gi_pvalue | hotspot |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2166474 | 2021-04-25 | Metsa-ja maastikutulekahju | Kulu | Kogutud kuhilad väljaspool rajatisi | 58.946 | 23.538 | Lääne maakond | Haapsalu linn | PAASTE_1A | ... | 1 | 1.0 | NaN | 1.0 | 473406.543225 | 6.534192e+06 | POINT (473406.543 6534192.248) | 3.487929 | 0.002 | hotspot |
| 1 | 2166232 | 2021-04-25 | Metsa-ja maastikutulekahju | Mets/maastik | None | 59.395 | 27.281 | Ida-Viru maakond | Kohtla-Järve linn | PAASTE_1 | ... | 2 | 2.0 | NaN | 2.0 | 686340.270306 | 6.588675e+06 | POINT (686340.27 6588675.282) | 0.678703 | 0.240 | not significant |
| 2 | 2165241 | 2021-04-23 | Metsa-ja maastikutulekahju | Mets/maastik | None | 59.318 | 24.229 | Harju maakond | Lääne-Harju vald | PAASTE_2 | ... | 8 | 6.0 | 6.0 | 2.0 | 513040.214259 | 6.575561e+06 | POINT (513040.214 6575561.392) | -2.604398 | 0.004 | coldspot |
| 3 | 2164552 | 2021-04-22 | Metsa-ja maastikutulekahju | Kulu | Kogutud kuhilad väljaspool rajatisi | 58.881 | 25.572 | Järva maakond | Paide linn | PAASTE_1 | ... | 3 | 3.0 | 1.0 | 2.0 | 590648.922000 | 6.527923e+06 | POINT (590648.922 6527922.759) | 0.394463 | 0.325 | not significant |
| 4 | 2164378 | 2021-04-22 | Metsa-ja maastikutulekahju | Kulu | Kulu | 59.360 | 26.986 | Ida-Viru maakond | Lüganuse vald | PAASTE_1 | ... | 2 | 2.0 | NaN | 2.0 | 669771.246017 | 6.583997e+06 | POINT (669771.246 6583997.357) | 0.410972 | 0.325 | not significant |

5 rows × 28 columns
# Plot hotspots and coldspots
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.set_aspect('equal')

hotspots = fires[fires['hotspot'] == 'hotspot']
coldspots = fires[fires['hotspot'] == 'coldspot']
not_significant = fires[fires['hotspot'] == 'not significant']

hotspots.plot(ax=ax, color='red', marker='o', label='Hotspots', markersize=15, zorder=3)
coldspots.plot(ax=ax, color='blue', marker='o', label='Coldspots', markersize=15, zorder=2)
not_significant.plot(ax=ax, color='lightgray', marker='o', label='Not Significant', markersize=14, zorder=1)

ax.set_title("Hotspots and Coldspots of landscape fires")
ax.legend()

# Add basemap
contextily.add_basemap(
    ax,
    source=contextily.providers.CartoDB.Voyager,
    crs="EPSG:3301"
)
We see from the results that there are coldspots around Tartu and Tallinn, which means that small fire incidents aggregate there. At the same time, larger fire incidents occur further away from the cities, which makes sense: there is more fuel for the fire, and it is more likely that a fire grows big before it is discovered.