DAU and retention analysis
Comparing test and control groups
SELECT date_trunc('day', e.occurred_at),
CASE WHEN flag = 'true'
THEN 'treatment' ELSE 'control' END as flag,
COUNT(e.event_name)
FROM tutorial.yammer_events e
WHERE e.event_name = 'login'
GROUP BY 1,2
ORDER BY 1 DESC, 2 DESC
Click through rates
SELECT date_trunc('day', occurred_at) as day,
1.00 * COUNT (CASE WHEN action = 'email_clickthrough' THEN user_id ELSE NULL END)
/ COUNT (CASE WHEN action = 'email_open' THEN user_id ELSE NULL END) as CTR,
COUNT (CASE WHEN action = 'email_clickthrough' THEN user_id ELSE NULL END) as clickthroughs,
COUNT (CASE WHEN action = 'email_open' THEN user_id ELSE NULL END) as opens
FROM tutorial.yammer_emails
GROUP BY 1
---
WITH open as (
SELECT date_trunc('day', occurred_at) as day,
COUNT(action) as opens
FROM tutorial.yammer_emails
WHERE action = 'email_open'
GROUP BY 1
),
clickthrough as (
SELECT date_trunc('day', occurred_at) as day,
COUNT(action) as clickthroughs
FROM tutorial.yammer_emails
WHERE action = 'email_clickthrough'
GROUP BY 1
)
SELECT clickthrough.day,
1.00*clickthroughs/opens as CTR,
clickthroughs,
opens
FROM clickthrough
JOIN open ON clickthrough.day = open.day
ORDER BY 1 DESC
DAU, WAU, MAU, and ratios between them
WITH dailies AS (
SELECT DATE_TRUNC('day', e.occurred_at) as date,
COUNT(DISTINCT e.user_id) as dau
FROM tutorial.yammer_events e
WHERE e.event_name = 'login'
GROUP BY 1
)
SELECT d,
dau,
(SELECT COUNT(DISTINCT e.user_id) as wau
FROM tutorial.yammer_events e
WHERE e.occurred_at::DATE BETWEEN
dailies.date - 7 * Interval '1 day' AND dailies.date
AND e.event_name = 'login'
) as wau_count,
(SELECT COUNT(DISTINCT e.user_id) as mau
FROM tutorial.yammer_events e
WHERE e.occurred_at::DATE BETWEEN
dailies.date - 30 * Interval '1 day' AND dailies.date
AND e.event_name = 'login'
) as mau_count,
100.00 * dau/(SELECT COUNT(DISTINCT e.user_id) as mau
FROM tutorial.yammer_events e
WHERE e.occurred_at::DATE BETWEEN
dailies.date - 30 * Interval '1 day' AND dailies.date
AND e.event_name = 'login'
) as dau_mau
FROM dailies
UID, first_active_date, last_active_date, previous_active_date
Retention
with monthly_activity as (
select distinct
date_trunc('month', created_at) as month,
user_id
from events
)
select
this_month.month,
count(distinct user_id)
from monthly_activity this_month
join monthly_activity last_month
on this_month.user_id = last_month.user_id
and this_month.month = add_months(last_month.month,1)
group by month
Churn
with monthly_activity as (
select distinct
date_trunc('month', created_at) as month,
user_id
from events
)
select
last_month.month + add_months(last_month.month,1),
count(distinct last_month.user_id)
from monthly_activity last_month
left join monthly_activity this_month
on this_month.user_id = last_month.user_id
and this_month.month = add_months(last_month.month,1)
where this_month.user_id is null
group by 1
Reactivated Users
with
monthly_activity as (
select distinct
date_trunc('month', created_at) as month,
user_id
from events
),
first_activity as (
select user_id, date(min(created_at)) as month
from events
group by 1
)
select
this_month.month,
count(distinct user_id)
from monthly_activity this_month
left join monthly_activity last_month
on this_month.user_id = last_month.user_id
and this_month.month = add_months(last_month.month,1)
join first_activity
on this_month.user_id = first_activity.user_id
and first_activity.month != this_month.month
where last_month.user_id is null
group by 1
Percent Change
with monthly_active_users as (
select
date_trunc('month', created_at) as month,
count (distinct user_id) as mau
from events
group by 1
)
select
this_month.month,
[(this_month.mau - last_month.mau)*1.0/last_month.mau:%] as pct_change
from monthly_active_users this_month
join monthly_active_users last_month
on this_month.month = add_months(last_month.month,1)
Sessionization
SELECT *
, extract(epoch from mytimestamp)
- lag(extract(epoch from mytimestamp))
over (PARTITION BY user_id order by mytimestamp) as time_interval
FROM toy_data_psql;
SELECT *
, CASE
WHEN EXTRACT(EPOCH FROM mytimestamp)
- LAG(EXTRACT(EPOCH FROM mytimestamp))
OVER (PARTITION BY user_id ORDER BY mytimestamp) >= 30 * 60
THEN 1
ELSE 0
END as new_session
FROM
toy_data_psql;
SELECT *
, user_id || '_' || SUM(new_session)
OVER (PARTITION BY user_id ORDER BY mytimestamp) AS session_id
FROM (
SELECT *
, CASE
WHEN EXTRACT(EPOCH FROM mytimestamp)
- LAG(EXTRACT(EPOCH FROM mytimestamp))
OVER (PARTITION BY user_id ORDER BY mytimestamp) >= 30 * 60
THEN 1
ELSE 0
END as new_session
FROM
toy_data_psql
) s1
import dataiku
import pandas as pd
from datetime import timedelta
# define treshold value
T = timedelta(seconds=30*60)
# load dataset
toy_data = dataiku.Dataset("toy_data").get_dataframe()
# add a column containing previous timestamp
toy_data = pd.concat([toy_data,
toy_data.groupby('user_id').transform(lambda x:x.shift(1))]
,axis=1)
toy_data.columns = ['user_id','mytimestamp','prev_mytimestamp']
# create the new session column
toy_data['new_session'] = ((toy_data['mytimestamp']
- toy_data['prev_mytimestamp'])>=T).astype(int)
# create the session_id
toy_data['increment'] = toy_data.groupby("user_id")['new_session'].cumsum()
toy_data['session_id'] = toy_data['user_id'].astype(str) + '_'
+ toy_data['increment'].astype(str)
# to get the same result as with hive/postgresql
toy_data = toy_data.sort(['user_id','mytimestamp'])