Physical activity classification using smartphone data

The goal of this project is to predict the type of physical activity (e.g., walking, climbing stairs) from tri-axial smartphone accelerometer data.
project
machine-learning

Data Loading and Exploration

Load essential libraries

import pandas as pd
import numpy as np
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the raw accelerometer samples and the activity labels.
X = pd.read_csv("data/har/time_series.csv")
y = pd.read_csv("data/har/labels.csv").label

# Mapping from the integer label codes to human-readable activity names.
activities = {1: 'standing', 2: 'walking', 3: 'stairs-down', 4: 'stairs-up'}

# Each entry of labels.csv is applied to 10 consecutive accelerometer samples,
# so every label is repeated 10 times to line up with the rows of X.
SAMPLES_PER_LABEL = 10
labels = list(np.repeat(y.to_numpy(), SAMPLES_PER_LABEL))

# The expanded label list is slightly longer than the time series; trim it to
# the number of samples instead of hard-coding the size of the overhang.
X['label'] = labels[:len(X)]
y = X.label
X.head()
Unnamed: 0 timestamp UTC time accuracy x y z label
0 20586 1565109930787 2019-08-06T16:45:30.787 unknown -0.006485 -0.934860 -0.069046 1
1 20587 1565109930887 2019-08-06T16:45:30.887 unknown -0.066467 -1.015442 0.089554 1
2 20588 1565109930987 2019-08-06T16:45:30.987 unknown -0.043488 -1.021255 0.178467 1
3 20589 1565109931087 2019-08-06T16:45:31.087 unknown -0.053802 -0.987701 0.068985 1
4 20590 1565109931188 2019-08-06T16:45:31.188 unknown -0.054031 -1.003616 0.126450 1
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3744 entries, 0 to 3743
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  3744 non-null   int64  
 1   timestamp   3744 non-null   int64  
 2   UTC time    3744 non-null   object 
 3   accuracy    3744 non-null   object 
 4   x           3744 non-null   float64
 5   y           3744 non-null   float64
 6   z           3744 non-null   float64
 7   label       3744 non-null   int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 234.1+ KB
y
0       1
1       1
2       1
3       1
4       1
       ..
3739    4
3740    4
3741    4
3742    4
3743    4
Name: label, Length: 3744, dtype: int64
standing = X.label == 1
walking = X.label == 2
stairs_down = X.label == 3
stairs_up = X.label == 4

x = np.linspace(0, len(labels)-6, len(labels)-6)

mpl.style.use("fivethirtyeight")
%matplotlib notebook

fig, ax = plt.subplots(2, 2, figsize=(12, 8))
ax[0, 0].plot(x[standing], X.x[standing], x[standing],
              X.y[standing], x[standing], X.z[standing], '-', alpha=0.4)
ax[0, 0].set_title(activities[1])

ax[0, 1].plot(x[walking], X.x[walking], x[walking],
              X.y[walking], x[walking], X.z[walking], '-', alpha=0.4)
ax[0, 1].set_title(activities[2])

ax[1, 0].plot(x[stairs_down],
              X.x[stairs_down], x[stairs_down],
              X.y[stairs_down], x[stairs_down],
              X.z[stairs_down], '-', alpha=0.4)
ax[1, 0].set_title(activities[3])

ax[1, 1].plot(X.timestamp[stairs_up], X.x[stairs_up], X.timestamp[stairs_up],
              X.y[stairs_up], X.timestamp[stairs_up], X.z[stairs_up], '-', alpha=0.4)
ax[1, 1].set_title(activities[4])

fig.suptitle("Tri-Axial Linear Acceleration", fontsize=25)
plt.gcf().autofmt_xdate()
fig.text(0.5, 0.05, 'time', ha='center', fontsize=16)
fig.text(0.01, 0.5, 'acceleration', va='center', rotation='vertical', fontsize=16)
fig.show()
# Plot the complete recording: all three acceleration axes against the
# raw timestamp column, on the current axes.
mpl.style.use("fivethirtyeight")
timestamps = X.timestamp
axes = plt.gca()
axes.plot(timestamps, X.x, timestamps, X.y, timestamps, X.z, '-', alpha=0.4)
axes.set_title("Tri-Axial Linear Acceleration")
axes.set_xlabel("time")
axes.set_ylabel("acceleration")
axes.figure.autofmt_xdate()
plt.show()
walking = X.label == 1
standing = X.label == 2
stairs_down = X.label == 3
stairs_up = X.label == 4
%matplotlib notebook
fig,axs = plt.subplots(4,1, figsize = (16,12), sharex=True)
sns.kdeplot(X.x[walking], shade=True, ax=axs[0])
sns.kdeplot(X.y[walking], shade=True, ax=axs[0])
sns.kdeplot(X.z[walking], shade=True, ax=axs[0])

sns.kdeplot(X.x[standing], shade=True, ax=axs[1])
sns.kdeplot(X.y[standing], shade=True, ax=axs[1])
sns.kdeplot(X.z[standing], shade=True, ax=axs[1])

sns.kdeplot(X.x[stairs_down], shade=True, ax=axs[2])
sns.kdeplot(X.y[stairs_down], shade=True, ax=axs[2])
sns.kdeplot(X.z[stairs_down], shade=True, ax=axs[2])

sns.kdeplot(X.x[stairs_up], shade=True, ax=axs[3])
sns.kdeplot(X.y[stairs_up], shade=True, ax=axs[3])
sns.kdeplot(X.z[stairs_up], shade=True, ax=axs[3])

axs[0].set_title(activities[1])
axs[1].set_title(activities[2])
axs[2].set_title(activities[3])
axs[3].set_title(activities[4])

axs[0].set_xlim((-3,2))
fig.suptitle("Tri-Axial Acceralometer Data", fontsize=20)
fig.show()

Modelling

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Features are the three raw acceleration components; target is the activity code.
train_covariates = X[['x', 'y', 'z']]
target = X.label
clf = RandomForestClassifier(max_depth=10, random_state=0)


def correlation(estimator, X, y):
    """Score an already-fitted estimator with R^2 between y and its predictions.

    Follows the scikit-learn scorer signature ``scorer(estimator, X, y)``.
    The estimator is NOT refitted here: refitting on the data being scored
    would turn every cross-validation fold into an in-sample evaluation.

    NOTE(review): R^2 on integer class codes is of doubtful meaning for a
    classifier -- confirm this scorer is still wanted.
    """
    y_pred = estimator.predict(X)
    return r2_score(y, y_pred)


def accuracy(estimator, X, y):
    """Score an already-fitted estimator with classification accuracy.

    Follows the scikit-learn scorer signature ``scorer(estimator, X, y)``.
    The estimator is NOT refitted here, so when used with cross_val_score the
    model trained on the training folds is evaluated on the held-out fold.
    """
    y_pred = estimator.predict(X)
    return accuracy_score(y, y_pred)


# Fit once on the full data set. Despite its name, `test_score` is the
# in-sample (training) accuracy, since the model is scored on the data it was
# fitted on; the name is kept because later cells read it.
clf.fit(train_covariates, target)
test_score = accuracy(clf, train_covariates, target)

# 10-fold cross-validation; cross_val_score fits a fresh clone of clf on each
# training split and hands the fitted clone to the scorer.
val_scores = cross_val_score(clf,
                             train_covariates,
                             target,
                             cv=10,
                             scoring=accuracy)
# Later cells read the cross-validation result under the name `scores`.
scores = val_scores
scores
array([0.97066667, 0.968     , 0.96      , 0.976     , 0.93582888,
       0.9973262 , 0.94919786, 0.98930481, 0.98128342, 0.99197861])
test_score
0.7641559829059829
scores.mean()
0.9719586452762924