import pandas as pd

df = pd.read_csv("leak_data_2022_2024.csv", parse_dates=['JOUR']).set_index('JOUR')

df.head()

import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(nrows=2)
df.plot(y='M3', ax=ax1)
df.plot(y='RELEVÉ', ax=ax2)
ax1.set_title('Daily water consumption (m³)')
ax2.set_title('Daily water index (m³)')
fig.tight_layout()

df_monthly = df.resample('ME').sum()

plt.bar(df_monthly.index, df_monthly['M3'], width=25)
plt.xticks(df_monthly.index, rotation=90);

df_monthly['month'] = df_monthly.index.strftime('%m')

pd.DataFrame([df_monthly.set_index('month')['M3'].iloc[:12], df_monthly.set_index('month')['M3'].iloc[12:-1]], index=['year before (2022-2023)', 'this year (2023-2024)']).T.plot.bar()

<Axes: xlabel='month'>

from scipy.stats import linregress

def index_to_num(index):
    return index.values.astype(float) / 1e9 / 3600 / 24

def compute_slope(s):
    x_num = index_to_num(s.index) 
    y = s.values
    reg = linregress(x_num, y)
    return reg.slope

rolling_slopes = df['RELEVÉ'].rolling(30, center=True).apply(func=compute_slope)

rolling_slopes.describe()

count    712.000000
mean       0.422559
std        0.110859
min        0.168674
25%        0.348944
50%        0.414397
75%        0.460289
max        0.696360
Name: RELEVÉ, dtype: float64

fig, (ax1, ax2) = plt.subplots(nrows=2)
df['RELEVÉ'].plot(ax=ax1)
rolling_slopes.plot(ax=ax2)
ax1.set_title('Daily water index (m³)')
ax2.set_title('Daily water index slope (m³/day)')
fig.tight_layout()

fig, ax = plt.subplots()

df.plot(y='RELEVÉ', ax=ax)


slope1 = compute_slope(df['RELEVÉ'].iloc[:365])
slope2 = compute_slope(df['RELEVÉ'].iloc[-180:])

ax.plot(df.index, 
        df['RELEVÉ'][0] + (index_to_num(df.index) - index_to_num(df.index)[0]) * slope1,
        label=f'fit at start, slope: {slope1:.3} m³/day')

ax.plot(df.index, 
        df['RELEVÉ'][-1] - (index_to_num(df.index)[-1] - index_to_num(df.index)) * slope2,
        label=f'fit at end, slope: {slope2:.3} m³/day')

ax.legend();

C:\Users\FL232714\AppData\Local\Temp\ipykernel_32\3281566500.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  df['RELEVÉ'][0] + (index_to_num(df.index) - index_to_num(df.index)[0]) * slope1,
C:\Users\FL232714\AppData\Local\Temp\ipykernel_32\3281566500.py:14: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  df['RELEVÉ'][-1] - (index_to_num(df.index)[-1] - index_to_num(df.index)) * slope2,

df['M3'].hist(bins=50)
plt.title('daily water consumption (m³)')
plt.ylabel('frequency')

Text(0, 0.5, 'frequency')

from scipy.stats import gamma

data = df['M3'].values
shape, loc, scale = gamma.fit(data, loc=0)

shape, loc, scale

(10.893046994811282, -0.20049479019006658, 0.057162527280664094)

import numpy as np

df['M3'].hist(bins=50, density=True)
plt.title('daily water consumption (m³)')
plt.ylabel('frequency')
x = np.linspace(0, 3.5, num=1000)
plt.plot(x, gamma.pdf(x, shape, loc, scale))

[<matplotlib.lines.Line2D at 0x219a5d4a5e0>]

import pymc as pm

with pm.Model() as model:
    alpha = pm.Uniform("alpha", lower=0, upper=20)  # Flat prior for shape parameter
    beta = pm.Uniform("beta", lower=0, upper=20)  # Flat prior for shape parameter
    
    # Likelihood (sampling distribution) of observations
    obs = pm.Gamma("obs", alpha=alpha, beta=beta, observed=data)
    
    # Sample from the posterior
    trace = pm.sample(5000, tune=1000, return_inferencedata=True)

# Plot the posterior distributions
pm.plot_posterior(trace, var_names=["alpha", "beta"])

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta]

Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 42 seconds.

array([<Axes: title={'center': 'alpha'}>,
       <Axes: title={'center': 'beta'}>], dtype=object)

with model:
    posterior_predictive = pm.sample_posterior_predictive(trace, var_names=["obs"], predictions=True)


posterior_samples = posterior_predictive.predictions['obs'].values.ravel()

df['M3'].hist(bins=50, density=True, label='data', alpha=0.5)
plt.title('daily water consumption (m³)')
plt.ylabel('frequency')
plt.hist(posterior_samples, bins=50, density=True, label='posterior', alpha=0.5)

plt.legend()

Sampling: [obs]

<matplotlib.legend.Legend at 0x219aec63e20>

alpha_mean, beta_mean = trace.posterior.alpha.mean().values, trace.posterior.beta.mean().values

x = np.linspace(0, data.max(), 1000)
gamma_pdf = gamma.pdf(x, a=alpha_mean, scale=1/beta_mean)

plt.hist(data, bins=30, density=True, alpha=0.5, label="Data histogram")
plt.plot(x, gamma_pdf, 'r-', lw=2, label=f'Gamma PDF (pymc)\n(alpha={alpha_mean:.2f}, beta={beta_mean:.2f})')
plt.plot(x, gamma.pdf(x, shape, loc, scale), label='Gamma PDF (scipy)')
plt.xlabel("Value")
plt.ylabel("Density")
plt.legend()
plt.show()

n_count_data = data.size

with pm.Model() as model:
    alpha_1 = pm.Uniform("alpha_1", lower=0, upper=20)  # Flat prior for shape parameter
    beta_1 = pm.Uniform("beta_1", lower=0, upper=20)  # Flat prior for shape parameter
    alpha_2 = pm.Uniform("alpha_2", lower=0, upper=20)  # Flat prior for shape parameter
    beta_2 = pm.Uniform("beta_2", lower=0, upper=20)  # Flat prior for shape parameter
    idx = np.arange(n_count_data) # Index
    tau = pm.DiscreteUniform("tau", lower=0, upper=n_count_data - 1)
    alpha_ = pm.math.switch(tau > idx, alpha_1, alpha_2)
    beta_ = pm.math.switch(tau > idx, beta_1, beta_2)
    
    # Likelihood (sampling distribution) of observations
    obs = pm.Gamma("obs", alpha=alpha_, beta=beta_, observed=data)

    # Sample from the posterior
    trace = pm.sample(5000, tune=1000, return_inferencedata=True)

Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>NUTS: [alpha_1, beta_1, alpha_2, beta_2]
>Metropolis: [tau]

Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 65 seconds.

pm.plot_posterior(trace)
plt.tight_layout()

alpha_1_samples = trace.posterior.alpha_1.values.ravel()
alpha_2_samples = trace.posterior.alpha_2.values.ravel()
beta_1_samples = trace.posterior.beta_1.values.ravel()
beta_2_samples = trace.posterior.beta_2.values.ravel()
tau_samples = trace.posterior.tau.values.ravel()

fig, axes = plt.subplots(nrows=3, figsize=(10, 10))

for ax, vars, labels in zip(axes.ravel(), 
                   [[alpha_1_samples, alpha_2_samples], [beta_1_samples, beta_2_samples], [tau_samples]],
                   ['alpha_1_samples, alpha_2_samples'.split(', '), 'beta_1_samples, beta_2_samples'.split(', '), ['tau_samples']]):
    for var, label in zip(vars, labels):
        ax.hist(var, bins=50, label=label)
    ax.legend()
fig.tight_layout()

alpha_mean1, beta_mean1 = trace.posterior.alpha_1.mean().values, trace.posterior.beta_1.mean().values
alpha_mean2, beta_mean2 = trace.posterior.alpha_2.mean().values, trace.posterior.beta_2.mean().values

x = np.linspace(0, data.max(), 1000)
gamma_pdf1 = gamma.pdf(x, a=alpha_mean1, scale=1/beta_mean1)
gamma_pdf2 = gamma.pdf(x, a=alpha_mean2, scale=1/beta_mean2)

plt.hist(data, bins=30, density=True, alpha=0.5, label="Data histogram")
plt.plot(x, gamma_pdf1, 'r-', lw=2, label=f'Gamma PDF $t<\\tau$\n(alpha1={alpha_mean1:.2f}, beta1={beta_mean1:.2f})')
plt.plot(x, gamma_pdf2, 'r-', lw=2, label=f'Gamma PDF $t>\\tau$\n(alpha2={alpha_mean2:.2f}, beta2={beta_mean2:.2f})')
plt.xlabel("Value")
plt.ylabel("Density")
plt.legend()
plt.show()

fig, (ax2, ax1) = plt.subplots(sharex=True, nrows=2)
ax1.plot(data)
ax1.set_title("daily water consumption (m³)")
ax2.hist(trace.posterior.tau.values.ravel(), density=True);
ax2.set_title("probability of switch time τ")
ax1.fill_between(np.arange(n_count_data)[:-1], 3 * (np.histogram(trace.posterior.tau.values.ravel(), bins=np.arange(n_count_data))[0] > 0.007), alpha=0.5, label='leak time')
ax1.legend()

<matplotlib.legend.Legend at 0x219ace18c10>

with model:
    predictions = pm.sample_posterior_predictive(trace, var_names=["obs"], predictions=True)

Sampling: [obs]

predictions_samples = predictions.predictions['obs'].values.reshape(-1, 741)

fig, ax = plt.subplots()
ax.plot(df.index, predictions_samples[np.random.choice(np.arange(predictions_samples.shape[0]), size=150)].T, alpha=0.01, color='k');
ax.set_ylim(0, 1.5)
fig.tight_layout()

mean_predictions = predictions.predictions['obs'].mean(axis=0).mean(axis=0).values

fig, ax = plt.subplots()

df.plot(y='M3', ax=ax)
ax.plot(df.index, 
        mean_predictions)

ax.set_title("daily water consumption (m³)")

Text(0.5, 1.0, 'daily water consumption (m³)')

s = pd.Series(tau_samples).value_counts()

df.index[s.index[0]]

Timestamp('2023-10-05 00:00:00')

	M3	RELEVÉ
JOUR
2022-07-12	0.362	405.939
2022-07-13	0.132	406.071
2022-07-14	0.313	406.384
2022-07-15	0.467	406.851
2022-07-16	0.433	407.284

Do I have a water leak? A bayesian answer

Qualitative data analysis¶

Linear model slope analysis¶

Bayesian modelling of the water leak: static model¶

Bayesian modelling of the water leak: dynamic time model¶

Comments