Commit 20d53311 authored by lucas_miranda's avatar lucas_miranda
Browse files

Added phenotype classification rule to snakemake pipeline

parent 67c7b974
......@@ -293,6 +293,7 @@ coords = project_coords.get_coords(
center=animal_id + undercond + "Center",
align=animal_id + undercond + "Spine_1",
align_inplace=True,
propagate_labels=(pheno_class > 0)
)
distances = project_coords.get_distances()
angles = project_coords.get_angles()
......
......@@ -15,8 +15,9 @@ import os
outpath = "/u/lucasmir/DLC/DLC_autoencoders/DeepOF/deepof/logs/"
losses = ["ELBO"]#, "MMD", "ELBO+MMD"]
encodings = [2, 4, 6, 8, 10, 12, 14, 16]
cluster_numbers = [1, 5, 10, 15, 20]
encodings = [4, 6, 8]#[2, 4, 6, 8, 10, 12, 14, 16]
cluster_numbers = [10, 15]#[1, 5, 10, 15, 20]
pheno_weights = [0.01, 0.1, 0.25, 0.5, 1, 2, 4, 10, 100]
rule deepof_experiments:
input:
......@@ -27,6 +28,14 @@ rule deepof_experiments:
encs=encodings,
k=cluster_numbers,
),
expand(
    "/u/lucasmir/DLC/DLC_autoencoders/DeepOF/deepof/logs/pheno_classification_experiments/trained_weights/"
    "GMVAE_loss={loss}_encoding={encs}_k={k}_pheno={phenos}_run_1_final_weights.h5",
    loss=losses,
    encs=encodings,
    k=cluster_numbers,
    # BUG FIX: keyword must be "phenos" to match the {phenos} wildcard in the
    # filename pattern above; "pheno=" would leave {phenos} unexpanded and
    # make expand() raise.
    phenos=pheno_weights,
)
# rule coarse_hyperparameter_tuning:
......@@ -71,6 +80,36 @@ rule explore_encoding_dimension_and_loss_function:
"--components {wildcards.k} "
"--input-type coords "
"--predictor 0 "
# BUG FIX: trailing space added — without it this literal concatenates with
# the next one, producing "--phenotype-classifier 0--variational" on the CLI.
"--phenotype-classifier 0 "
"--variational True "
"--loss {wildcards.loss} "
"--kl-warmup 20 "
"--mmd-warmup 20 "
"--montecarlo-kl 10 "
"--encoding-size {wildcards.encs} "
"--batch-size 256 "
"--window-size 11 "
"--window-step 11 "
"--stability-check 3 "
"--output-path {outpath}dimension_and_loss_experiments"
rule explore_phenotype_classification:
input:
data_path="/u/lucasmir/DLC/DLC_models/deepof_single_topview/",
output:
trained_models=os.path.join(
outpath,
"pheno_classification_experiments/trained_weights/GMVAE_loss={loss}_encoding={encs}_k={k}_pheno={phenos}_run_1_final_weights.h5",
),
shell:
"pipenv run python -m deepof.train_model "
"--train-path {input.data_path} "
"--val-num 5 "
"--components {wildcards.k} "
"--input-type coords "
"--predictor 0 "
# BUG FIX: trailing space added so the phenotype weight does not fuse with
# the following "--variational" flag when the literals are concatenated.
"--phenotype-classifier {wildcards.phenos} "
"--variational True "
"--loss {wildcards.loss} "
"--kl-warmup 20 "
......
%% Cell type:code id: tags:
``` python
%load_ext autoreload
%autoreload 2
```
%% Cell type:code id: tags:
``` python
import os
os.chdir(os.path.dirname("../"))
```
%% Cell type:code id: tags:
``` python
import deepof.data
import deepof.models
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import tensorflow as tf
import tqdm.notebook as tqdm
from ipywidgets import interact
```
%% Cell type:markdown id: tags:
# Retrieve phenotypes
%% Cell type:code id: tags:
``` python
flatten = lambda t: [item for sublist in t for item in sublist]
```
%% Cell type:code id: tags:
``` python
# Load first batch of phenotype annotations (two Excel sheets).
# NOTE(review): user-specific relative paths — notebook-only cell.
dset11 = pd.ExcelFile(
    "../../Desktop/deepof-data/tagged_videos/Individual_datasets/DLC_batch_1/DLC_single_CDR1_1/1.Openfield_data-part1/JB05.1-OF-SI-part1.xlsx"
)
dset12 = pd.ExcelFile(
    "../../Desktop/deepof-data/tagged_videos/Individual_datasets/DLC_batch_1/DLC_single_CDR1_1/2.Openfielddata-part2/AnimalID's-JB05.1-part2.xlsx"
)
dset11 = pd.read_excel(dset11, "Tabelle2")
dset12 = pd.read_excel(dset12, "Tabelle2")
# Tag each test ID with its batch/session suffix. The merged diff had kept
# both the old "_s1.x" and new "_s1x" apply lines; running both would tag
# twice ("Test Test ..."), so only the new lines are kept.
dset11.Test = dset11.Test.apply(lambda x: "Test {}_s11".format(x))
dset12.Test = dset12.Test.apply(lambda x: "Test {}_s12".format(x))
# Partition test IDs into stressed (CSDS) vs non-stressed (NS) by treatment.
dset1 = {"CSDS": list(dset11.loc[dset11.Treatment.isin(["CTR+CSDS", "NatCre+CSDS"]), "Test"]) +
         list(dset12.loc[dset12.Treatment.isin(["CTR+CSDS", "NatCre+CSDS"]), "Test"]),
         "NS": list(dset11.loc[dset11.Treatment.isin(["CTR+nonstressed", "NatCre+nonstressed"]), "Test"]) +
         list(dset12.loc[dset12.Treatment.isin(["CTR+nonstressed", "NatCre+nonstressed"]), "Test"]),}
# Invert to a test-ID -> condition lookup.
dset1inv = {}
for i in flatten(list(dset1.values())):
    if i in dset1["CSDS"]:
        dset1inv[i] = "CSDS"
    else:
        dset1inv[i] = "NS"
assert len(dset1inv) == dset11.shape[0] + dset12.shape[0], "You missed some labels!"
```
%% Cell type:code id: tags:
``` python
# Load second batch of phenotype annotations.
dset21 = pd.read_excel(
    "../../Desktop/deepof-data/tagged_videos/Individual_datasets/DLC_batch_2/Part1/2_Single/stressproject22.04.2020genotypes-openfieldday1.xlsx"
)
dset22 = pd.read_excel(
    "../../Desktop/deepof-data/tagged_videos/Individual_datasets/DLC_batch_2/Part2/2_Single/OpenFieldvideos-part2.xlsx"
)
# Tag test IDs with the batch/session suffix. The merged diff had kept both
# the old "_s2.x" and new "_s2x" apply lines; applying both would double-tag,
# so only the new lines are kept.
dset21.Test = dset21.Test.apply(lambda x: "Test {}_s21".format(x))
dset22.Test = dset22.Test.apply(lambda x: "Test {}_s22".format(x))
# NOTE(review): the two sheets label stress inconsistently ("Stress" vs
# "Stressed") — matched verbatim here; verify against the source files.
dset2 = {"CSDS": list(dset21.loc[dset21.Treatment == "Stress", "Test"]) +
         list(dset22.loc[dset22.Treatment == "Stressed", "Test"]),
         "NS": list(dset21.loc[dset21.Treatment == "Nonstressed", "Test"]) +
         list(dset22.loc[dset22.Treatment == "Nonstressed", "Test"])}
# Invert to a test-ID -> condition lookup.
dset2inv = {}
for i in flatten(list(dset2.values())):
    if i in dset2["CSDS"]:
        dset2inv[i] = "CSDS"
    else:
        dset2inv[i] = "NS"
assert len(dset2inv) == dset21.shape[0] + dset22.shape[0], "You missed some labels!"
```
%% Cell type:code id: tags:
``` python
# Load third batch of phenotype annotations.
# The merged diff had kept both the old call (bare path, default sheet) and
# the new one (path + sheet_name=1); left as-is the two adjacent string
# literals concatenate into a nonexistent path. Keep only the new call.
dset31 = pd.read_excel(
    "../../Desktop/deepof-data/tagged_videos/Individual_datasets/DLC_batch_3/1.Day2OF-SIpart1/JB05 2Female-ELS-OF-SIpart1.xlsx",
    sheet_name=1
)
dset32 = pd.read_excel(
    "../../Desktop/deepof-data/tagged_videos/Individual_datasets/DLC_batch_3/2.Day3OF-SIpart2/JB05 2FEMALE-ELS-OF-SIpart2.xlsx",
    sheet_name=1
)
# Tag test IDs with the batch/session suffix (new "_s3x" scheme only; the
# old "_s3.x" apply lines from the diff are dropped to avoid double-tagging).
dset31.Test = dset31.Test.apply(lambda x: "Test {}_s31".format(x))
dset32.Test = dset32.Test.apply(lambda x: "Test {}_s32".format(x))
# Batch 3 contains no stressed animals: everything is labelled NS.
dset3 = {"CSDS": [],
         "NS": list(dset31.loc[:, "Test"]) +
         list(dset32.loc[:, "Test"])}
dset3inv = {}
for i in flatten(list(dset3.values())):
    if i in dset3["CSDS"]:
        dset3inv[i] = "CSDS"
    else:
        dset3inv[i] = "NS"
assert len(dset3inv) == dset31.shape[0] + dset32.shape[0], "You missed some labels!"
```
%% Cell type:code id: tags:
``` python
# Load fourth batch: no annotation sheet — labels come from the video folder.
dset41 = os.listdir("../../Desktop/deepof-data/tagged_videos/Individual_datasets/DLC_batch_4/JB05.4-OpenFieldvideos/")
# Remove empty video!
dset41 = [vid for vid in dset41 if "52" not in vid]
# All fourth-batch animals are non-stressed; strip the 4-char extension and
# tag with the batch suffix. The merged diff had kept both the old "_s4" and
# new "_s41" dict lines, which was a syntax error — keep only the new one.
dset4 = {"CSDS": [],
         "NS": [i[:-4] + "_s41" for i in dset41]}
dset4inv = {}
for i in flatten(list(dset4.values())):
    if i in dset4["CSDS"]:
        dset4inv[i] = "CSDS"
    else:
        dset4inv[i] = "NS"
assert len(dset4inv) == len(dset41), "You missed some labels!"
```
%% Cell type:code id: tags:
``` python
# Merge phenotype dicts and serialise!
# Later batches win on key collisions, but test IDs carry per-batch suffixes
# so collisions should not occur in practice.
aggregated_dset = {**dset1inv, **dset2inv, **dset3inv, **dset4inv}
```
%% Cell type:code id: tags:
``` python
from collections import Counter
print(Counter(aggregated_dset.values()))
print(115+52)
```
%% Output
Counter({'NS': 115, 'CSDS': 52})
167
%% Cell type:code id: tags:
``` python
# Save aggregated dataset to disk
import pickle
with open("../../Desktop/deepof-data/deepof_single_topview/deepof_exp_conditions.pkl", "wb") as handle:
pickle.dump(aggregated_dset, handle, protocol=pickle.HIGHEST_PROTOCOL)
```
%% Cell type:markdown id: tags:
# Define and run project
%% Cell type:code id: tags:
``` python
%%time
deepof_main = deepof.data.project(path=os.path.join("..","..","Desktop","deepof-data","deepof_single_topview"),
smooth_alpha=0.99,
arena_dims=[380],
#exclude_bodyparts=["Tail_1", "Tail_2", "Tail_tip", "Tail_base", "Spine_2"]
#exp_conditions=dset2inv
exp_conditions=aggregated_dset
)
```
%% Output
CPU times: user 28.1 s, sys: 4.99 s, total: 33.1 s
Wall time: 7.5 s
CPU times: user 28 s, sys: 4.9 s, total: 32.9 s
Wall time: 7.4 s
%% Cell type:code id: tags:
``` python
%%time
deepof_main = deepof_main.run(verbose=True)
print(deepof_main)
```
%% Output
Loading trajectories...
Smoothing trajectories...
Computing distances...
Computing angles...
Done!
deepof analysis of 167 videos
CPU times: user 46.6 s, sys: 5.09 s, total: 51.7 s
Wall time: 53 s
Coordinates of 167 videos across 2 conditions
CPU times: user 46.3 s, sys: 3.75 s, total: 50 s
Wall time: 52.4 s
%% Cell type:code id: tags:
``` python
all_quality = pd.concat([tab for tab in deepof_main.get_quality().values()]).droplevel("scorer", axis=1)
```
%% Cell type:code id: tags:
``` python
all_quality.boxplot(rot=45)
plt.ylim(0.99985, 1.00001)
plt.show()
```
%% Output
%% Cell type:code id: tags:
``` python
@interact(quality_top=(0., 1., 0.01))
def low_quality_tags(quality_top):
    # For each body part, plot the percentage of frames whose tracking
    # quality falls below the interactive threshold `quality_top`.
    # NOTE(review): relies on the notebook-global `all_quality` DataFrame
    # built two cells above.
    pd.DataFrame(pd.melt(all_quality).groupby("bodyparts").value.apply(
        lambda y: sum(y < quality_top) / len(y) * 100)
    ).sort_values(by="value", ascending=False).plot.bar(rot=45)
    plt.xlabel("body part")
    plt.ylabel("Tags with quality under {} (%)".format(quality_top))
    plt.tight_layout()
    plt.legend([])
    plt.show()
```
%% Output
%% Cell type:markdown id: tags:
# Generate coords
%% Cell type:code id: tags:
``` python
%%time
deepof_coords = deepof_main.get_coords(center="Center", polar=False, speed=0, align="Spine_1", align_inplace=True, propagate_labels=False)
deepof_coords = deepof_main.get_coords(center="Center", polar=False, speed=0, align="Spine_1", align_inplace=True, propagate_labels=True)
#deepof_dists = deepof_main.get_distances(propagate_labels=False)
#deepof_angles = deepof_main.get_angles(propagate_labels=False)
```
%% Output
CPU times: user 54.3 s, sys: 475 ms, total: 54.8 s
Wall time: 54.8 s
CPU times: user 1min 2s, sys: 2.29 s, total: 1min 4s
Wall time: 1min 7s
%% Cell type:markdown id: tags:
# Visualization
%% Cell type:code id: tags:
``` python
%%time
tf.keras.backend.clear_session()
print("Preprocessing training set...")
deepof_train = deepof_coords.preprocess(
window_size=11,
window_step=11,
conv_filter=None,
scale="minmax",
shuffle=True,
test_videos=0,
)[0]
print("Loading pre-trained model...")
encoder, decoder, grouper, gmvaep, = deepof.models.SEQ_2_SEQ_GMVAE(
loss="ELBO",
number_of_components=10,
compile_model=True,
kl_warmup_epochs=20,
montecarlo_kl=10,
encoding=6,
mmd_warmup_epochs=20,
predictor=0,
phenotype_prediction=0,
).build(deepof_train.shape)[:4]
gmvaep.load_weights("../../Desktop/GMVAE_loss=ELBO_encoding=6_run_0_final_weights.h5")
```
%% Output
Preprocessing training set...
Loading pre-trained model...
(227686, 11, 26)
CPU times: user 3.35 s, sys: 656 ms, total: 4.01 s
Wall time: 4.07 s
%% Cell type:code id: tags:
``` python
# Pick one random video and run it through the trained GMVAE.
video_key = np.random.choice(list(deepof_coords.keys()), 1)[0]
# window_step=1 -> one (overlapping) 11-frame window per frame, standard-scaled.
video_input = deepof.data.table_dict({video_key: deepof_coords[video_key]}, typ="coords").preprocess(
    window_size=11,
    window_step=1,
    conv_filter=None,
    scale="standard",
    shuffle=False,
    test_videos=0,
)[0]
# Fit a scaler over ALL videos so predictions can be mapped back to the
# original coordinate space.
# NOTE(review): assumes this matches the scaler preprocess() used internally —
# verify against deepof.data's preprocessing code.
scaler = StandardScaler()
scaler.fit(np.array(pd.concat(list(deepof_coords.values()))))
# Get reconstruction; index 6 picks the centre frame of each 11-frame window.
video_pred = gmvaep.predict(video_input)[:, 6, :]
# Get encodings
# video_clusters = grouper.predict(video_input)
# video_encodings = encoder.predict(video_input)
# Undo the scaling and re-attach body-part column labels for plotting.
scaled_video_pred = scaler.inverse_transform(video_pred)
scaled_video_input = scaler.inverse_transform(video_input[:, 6, :])
scaled_video_input = pd.DataFrame(scaled_video_input, columns=deepof_coords[video_key].columns)
scaled_video_pred = pd.DataFrame(scaled_video_pred, columns=deepof_coords[video_key].columns)
```
%% Cell type:code id: tags:
``` python
tf.reduce_mean( tf.keras.losses.mean_absolute_error(video_pred, video_input[:, 6, :]))
```
%% Output
<tf.Tensor: shape=(), dtype=float64, numpy=0.8289839462658529>
%% Cell type:code id: tags:
``` python
# Draft: function to produce a video with the animal in motion using cv2
import cv2
w = 400
h = 400
factor = 2.5
# Instantiate video
writer = cv2.VideoWriter()
writer.open(
"test_video.avi",
cv2.VideoWriter_fourcc(*"MJPG"),
24,
(int(w * factor), int(h * factor)),
True,
)
for frame in tqdm.tqdm(range(100)):
image = np.zeros((h, w, 3), np.uint8) + 30
for bpart in scaled_video_input.columns.levels[0]:
try:
pos = (
(-int(scaled_video_input[bpart].loc[frame, "x"]) + w // 2),
(-int(scaled_video_input[bpart].loc[frame, "y"]) + h // 2),
)
pos_pred = (
(-int(scaled_video_pred[bpart].loc[frame, "x"]) + w // 2),
(-int(scaled_video_pred[bpart].loc[frame, "y"]) + h // 2),
)
cv2.circle(image, pos, 2, (0, 0, 255), -1)
cv2.circle(image, pos_pred, 2, (0, 255, 0), -1)
except KeyError:
continue
# draw skeleton
def draw_line(start, end, df, col):
for bpart in end:
cv2.line(
image,
tuple(-df[start].loc[frame, :].astype(int) + w // 2),
tuple(-df[bpart].loc[frame, :].astype(int) + h // 2),
col,
1,
)
for df, col in zip([scaled_video_input, scaled_video_pred], [(0,0,255),(0,255,0)]):
draw_line("Nose", ["Left_ear", "Right_ear"], df, col)
draw_line("Spine_1", ["Left_ear", "Right_ear", "Left_fhip", "Right_fhip"], df, col)
draw_line("Spine_2", ["Spine_1", "Tail_base", "Left_bhip", "Right_bhip"], df, col)
draw_line("Tail_1", ["Tail_base", "Tail_2"], df, col)
draw_line("Tail_tip", ["Tail_2"], df, col)
image = cv2.resize(image, (0, 0), fx=factor, fy=factor)
writer.write(image)
writer.release()
cv2.destroyAllWindows()
```
%% Output
%% Cell type:code id: tags:
``` python
samples = 100000
all_clusters = grouper.predict(deepof_train[:samples])
all_encodings = encoder.predict(deepof_train[:samples])
```
%% Cell type:code id: tags:
``` python
all_clusters
```
%% Cell type:code id: tags:
``` python
sns.scatterplot(all_encodings[:,0], all_encodings[:,1], hue=np.argmax(all_clusters,axis=1))
```
%% Cell type:code id: tags:
``` python
# import umap
# from sklearn.manifold import TSNE
# from sklearn.decomposition import PCA
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# import plotly.express as px
```
%% Cell type:code id: tags:
``` python
def plot_encodings(data, samples, n, clusters, threshold, highlight=None):
    """Project latent encodings to n dimensions with LDA (supervised by the
    hard cluster assignment) and return an interactive plotly scatter.

    NOTE(review): depends on LinearDiscriminantAnalysis and plotly.express
    (px), whose imports are commented out in the cell above — re-enable them
    before running. `threshold` and `highlight` are currently unused, and
    n outside {2, 3} leaves `enc` unbound (NameError on return).
    """
    reducer = LinearDiscriminantAnalysis(n_components=n)
    clusters = clusters[:samples, :]
    # filter = np.max(np.mean(clusters, axis=0), axis=1) > threshold
    # Hard assignment: argmax over per-component probabilities.
    clusters = np.argmax(clusters, axis=1)  # [filter]
    rep = reducer.fit_transform(data[:samples], clusters)
    if n == 2:
        df = pd.DataFrame({"encoding-1": rep[:, 0], "encoding-2": rep[:, 1], "clusters": ["A" + str(i) for i in clusters]})
        enc = px.scatter(data_frame=df, x="encoding-1", y="encoding-2",
                         color="clusters", width=600, height=600,
                         color_discrete_sequence=px.colors.qualitative.T10)
        # if highlight:
        #     ig.add_trace(go.Scatter(x=, y=)
    elif n == 3:
        df3d = pd.DataFrame({"encoding-1": rep[:, 0], "encoding-2": rep[:, 1], "encoding-3": rep[:, 2],
                             "clusters": ["A" + str(i) for i in clusters]})
        enc = px.scatter_3d(data_frame=df3d, x="encoding-1", y="encoding-2", z="encoding-3",
                            color="clusters", width=600, height=600,
                            color_discrete_sequence=px.colors.qualitative.T10)
    return enc
```
%% Cell type:code id: tags:
``` python
plot_encodings(all_encodings, 10000, 2, all_clusters, 1, 10)
```
%% Cell type:markdown id: tags:
# Preprocessing
%% Cell type:code id: tags:
``` python
%%time
# 11-frame non-overlapping windows, standard-scaled and aligned; 5 videos are
# held out as the test split.
# NOTE(review): sigma presumably parametrises conv_filter and is unused when
# conv_filter=None — confirm against deepof's preprocess signature. The second
# and fourth prints say "dataset" but show the LABEL arrays' shapes.
X_train, y_train, X_test, y_test = deepof_coords.preprocess(window_size=11, window_step=11, conv_filter=None, sigma=55,
                                                            shift=0, scale='standard', align='all', shuffle=True, test_videos=5)
print("Train dataset shape: ", X_train.shape)
print("Train dataset shape: ", y_train.shape)
print("Test dataset shape: ", X_test.shape)
print("Test dataset shape: ", y_test.shape)
```
%% Cell type:markdown id: tags:
# Build models and get learning rate (1-cycle policy)
%% Cell type:markdown id: tags:
### Seq 2 seq Variational Auto Encoder
%% Cell type:code id: tags:
``` python
from datetime import datetime
import tensorflow.keras as k
import tensorflow as tf
```