Gems/MotionMatching/JupyterNotebooks/FeatureAnalysis.ipynb
# Path of the feature matrix CSV file that is loaded with pd.read_csv.
# NOTE(review): hard-coded absolute Windows path - adjust it for your machine.
featureMatrixFilePath = 'E:/MotionMatchingFeatureMatrix.csv'
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.decomposition import PCA
def PrintGreen(text):
    """Print ``text`` highlighted with a green background (ANSI escape codes).

    text: string to print; it is concatenated with the escape sequences, so it must be a str.
    """
    highlightOn = '\x1b[6;30;42m'  # ANSI SGR: black foreground (30) on green background (42)
    highlightOff = '\x1b[0m'       # ANSI SGR: reset all attributes
    message = highlightOn + text + highlightOff
    print(message)
def PrintRed(text):
    """Print ``text`` highlighted with a red background (ANSI escape codes).

    text: string to print; it is concatenated with the escape sequences, so it must be a str.
    """
    highlightOn = '\33[41m'    # '\33' is the octal spelling of ESC; SGR 41 = red background
    highlightOff = '\x1b[0m'   # ANSI SGR: reset all attributes
    message = highlightOn + text + highlightOff
    print(message)
# Read the feature matrix from CSV; cells holding the literal text 'null' are parsed as NaN
originalData = pd.read_csv(featureMatrixFilePath, na_values='null')

# The load only counts as successful with at least one row (frame) and one column
# (feature component); DataFrame.empty is True as soon as either axis has length 0.
if originalData.empty:
    PrintRed("Loading failed!")
else:
    PrintGreen("Loading succeeded")

print(f"frames = {originalData.shape[0]}")
print(f"featureComponents = {originalData.shape[1]}")
# Make sure all columns are shown when a DataFrame is rendered.
# The fully qualified option name is required: the short pattern 'max_columns' also
# matches 'styler.render.max_columns' on newer pandas versions, which makes
# pd.set_option raise an OptionError because the pattern is ambiguous.
pd.set_option('display.max_columns', originalData.shape[1])

# Preview the first 15 frames of the loaded feature matrix
originalData.head(15)
Remove columns containing only 0.0
def CleanData(data):
    """Return a DataFrame holding only the columns of ``data`` that are not all zero.

    data: DataFrame with one row per frame and one column per feature component.

    Prints how many columns were removed along with the resulting frame and
    feature component counts. Returns the column-filtered DataFrame.
    """
    # A column is kept when at least one of its entries differs from 0
    # (NaN compares unequal to 0, so a column containing NaN is kept as well).
    hasNonZeroEntry = (data != 0).any()
    keptColumns = data.columns[hasNonZeroEntry]
    cleanedData = data[keptColumns]

    originalFrames, originalComponents = data.shape
    cleanedFrames, cleanedComponents = cleanedData.shape

    # Sanity check: dropping columns must never change the number of rows
    if cleanedFrames != originalFrames:
        PrintRed("Frame count of original and cleaned data should match!")

    removedComponents = originalComponents - cleanedComponents
    if removedComponents > 0:
        PrintGreen(str(removedComponents) + " feature components containing only 0.0 values removed")

    print("frames = " + str(cleanedFrames))
    print("featureComponents = " + str(cleanedComponents))
    return cleanedData
# Drop the all-zero feature components and remember the resulting dimensions
cleanedData = CleanData(originalData)
frameCount, cleanedFeatureComponentCount = cleanedData.shape

# Preview the first 15 frames of the cleaned data
cleanedData.head(15)
def Histogram(data):
    """Plot the histograms of ``data`` on a 32 x 32 figure.

    data: object exposing ``hist(figsize=...)`` - presumably a pandas DataFrame,
          for which one histogram per column is drawn.
    """
    data.hist(figsize=(32, 32))
# Plot the value distribution of the cleaned feature components
Histogram(cleanedData)
The median is drawn in orange inside the box
Box = interquartile range (IQR), which means 50% of the data lies within the box
Black line (whisker) range = about 99.3% of the values, assuming normally distributed data
Semi-transparent outliers represent the remaining 0.7%
def BoxPlot(data, featureComponentCount):
    """Draw a horizontal box plot with one box per feature component and show it.

    data: DataFrame whose columns are the feature components to plot; the column
          names are used as y tick labels.
    featureComponentCount: number of columns in ``data``; determines how many
          y ticks (one per box) are labelled.
    """
    _, axes = plt.subplots(figsize=(20, 20))
    axes.set_title('Feature Component Boxplot')

    # Render outliers as tiny, almost fully transparent light grey dots so that
    # dense clusters of outliers do not drown out the boxes.
    flierprops = dict(
        marker='o',
        markerfacecolor='gainsboro',
        markersize=1,
        linestyle='none',
        markeredgecolor='gainsboro',
        alpha=0.005,
    )
    axes.boxplot(data, vert=False, flierprops=flierprops)

    # Boxes are placed at positions 1..featureComponentCount by default;
    # label each position with the name of its column.
    elementNumbers = np.arange(1, featureComponentCount + 1)
    plt.yticks(elementNumbers, data.columns)

    plt.show()
# Box plot of the cleaned data, passing its column count as the feature component count
BoxPlot(cleanedData, cleanedData.shape[1])
# Pairwise correlation between all cleaned feature components (pandas default method)
correlationMatrix = cleanedData.corr()

# Render the correlation matrix as a heatmap on a large figure
plt.figure(figsize=(32, 32))
sns.heatmap(correlationMatrix)
Use principal component analysis to project the multi-dimensional data down to 2D
def ScatterPlotPCA(data):
    """Scatter-plot ``data`` projected onto its first two principal components.

    data: 2D table of samples (rows) by features (columns) that PCA is fitted on
          and that is then transformed with the fitted model.
    """
    # Fit a 2-component PCA on the data and project the same data with it
    projected = PCA(n_components=2).fit(data).transform(data)

    # First principal component on the x axis, second one on the y axis
    plt.figure(figsize=(16, 16))
    plt.scatter(projected[:, 0], projected[:, 1], s=2.0, alpha=0.5)
# PCA scatter plot of the cleaned data
ScatterPlotPCA(cleanedData)
# Alternative ways to normalize directly with pandas
# (pandas automatically applies these operations column-wise):
#   mean normalization:    normalized_df = (df - df.mean()) / df.std()
#   min-max normalization: normalized_df = (df - df.min()) / (df.max() - df.min())

# Min-max normalization using sklearn: scale every column into the [0, 1] range
x = cleanedData.values
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
x_scaled = min_max_scaler.fit_transform(x)

# Wrap the scaled array in a DataFrame again, copying the column names from the source
normalizedData = pd.DataFrame(x_scaled, columns=cleanedData.columns)

# Min values per column used to normalize the data (followed by an empty line)
print("Minimum values per feature component / column")
print(min_max_scaler.data_min_, end="\n\n")

# Max values per column used to normalize the data
print("Maximum values per feature component / column")
print(min_max_scaler.data_max_)

# Preview the first 15 frames of the normalized data
normalizedData.head(15)
# Draw the histograms and the PCA scatter plot for the normalized data
Histogram(normalizedData)
ScatterPlotPCA(normalizedData)