DAU and retention analysis

Comparing test and control groups

SELECT date_trunc('day', e.occurred_at),
    CASE WHEN flag = 'true' 
      THEN 'treatment' ELSE 'control' END as flag,
    COUNT(e.event_name)
FROM tutorial.yammer_events e
WHERE e.event_name = 'login'
GROUP BY 1,2
ORDER BY 1 DESC, 2 DESC

Click through rates

SELECT date_trunc('day', occurred_at) as day,
  1.00 * COUNT (CASE WHEN action = 'email_clickthrough' THEN user_id ELSE NULL END) 
        / COUNT (CASE WHEN action = 'email_open' THEN user_id ELSE NULL END) as CTR,
  COUNT (CASE WHEN action = 'email_clickthrough' THEN user_id ELSE NULL END) as clickthroughs,
  COUNT (CASE WHEN action = 'email_open' THEN user_id ELSE NULL END) as opens
FROM tutorial.yammer_emails
GROUP BY 1
---

WITH open as (
  SELECT date_trunc('day', occurred_at) as day,
         COUNT(action) as opens
  FROM tutorial.yammer_emails
  WHERE action = 'email_open'
  GROUP BY 1
),
  clickthrough as (
  SELECT date_trunc('day', occurred_at) as day,
         COUNT(action) as clickthroughs
  FROM tutorial.yammer_emails
  WHERE action = 'email_clickthrough'
  GROUP BY 1
)

SELECT clickthrough.day,
       1.00*clickthroughs/opens as CTR, 
       clickthroughs, 
       opens
FROM clickthrough
JOIN open ON clickthrough.day = open.day
ORDER BY 1 DESC

DAU, WAU, MAU, and ratios between them

WITH dailies AS (
  SELECT DATE_TRUNC('day', e.occurred_at) as date,
       COUNT(DISTINCT e.user_id) as dau
  FROM tutorial.yammer_events e
  WHERE e.event_name = 'login'
  GROUP BY 1 
)
SELECT d, 
    dau, 
    (SELECT COUNT(DISTINCT e.user_id) as wau
          FROM tutorial.yammer_events e
          WHERE e.occurred_at::DATE BETWEEN 
            dailies.date - 7 * Interval '1 day' AND dailies.date
          AND e.event_name = 'login'
          ) as wau_count,
    (SELECT COUNT(DISTINCT e.user_id) as mau
          FROM tutorial.yammer_events e
          WHERE e.occurred_at::DATE BETWEEN 
            dailies.date - 30 * Interval '1 day' AND dailies.date
          AND e.event_name = 'login'
          ) as mau_count,
    100.00 * dau/(SELECT COUNT(DISTINCT e.user_id) as mau
          FROM tutorial.yammer_events e
          WHERE e.occurred_at::DATE BETWEEN 
            dailies.date - 30 * Interval '1 day' AND dailies.date
          AND e.event_name = 'login'
          ) as dau_mau
FROM dailies

UID, first_active_date, last_active_date, previous_active_date

Retention

with monthly_activity as (
  select distinct
    date_trunc('month', created_at) as month,
    user_id
  from events
)
select
  this_month.month,
  count(distinct user_id)
from monthly_activity this_month
join monthly_activity last_month
  on this_month.user_id = last_month.user_id
  and this_month.month = add_months(last_month.month,1)
group by month

Churn

with monthly_activity as (
  select distinct
    date_trunc('month', created_at) as month,
    user_id
  from events
)
select
  last_month.month + add_months(last_month.month,1),
  count(distinct last_month.user_id)
from monthly_activity last_month
left join monthly_activity this_month
  on this_month.user_id = last_month.user_id
  and this_month.month = add_months(last_month.month,1)
where this_month.user_id is null
group by 1

Reactivated Users

with
monthly_activity as (
  select distinct
    date_trunc('month', created_at) as month,
    user_id
  from events
),
first_activity as (
  select user_id, date(min(created_at)) as month
  from events
  group by 1
)
select
  this_month.month,
  count(distinct user_id)
from monthly_activity this_month
left join monthly_activity last_month
  on this_month.user_id = last_month.user_id
  and this_month.month = add_months(last_month.month,1)
join first_activity
  on this_month.user_id = first_activity.user_id
  and first_activity.month != this_month.month
where last_month.user_id is null
group by 1

Percent Change

with monthly_active_users as (
 select
   date_trunc('month', created_at) as month,
   count (distinct user_id) as mau
 from events
 group by 1
)
select
 this_month.month,
 [(this_month.mau - last_month.mau)*1.0/last_month.mau:%] as pct_change
from monthly_active_users this_month
join monthly_active_users last_month
 on this_month.month = add_months(last_month.month,1)

Sessionization

SELECT *
      ,  extract(epoch from mytimestamp)
         - lag(extract(epoch from mytimestamp))
         over (PARTITION BY user_id order by mytimestamp) as time_interval
FROM toy_data_psql;

SELECT *
  , CASE
      WHEN EXTRACT(EPOCH FROM mytimestamp)
           - LAG(EXTRACT(EPOCH FROM mytimestamp))
           OVER (PARTITION BY user_id ORDER BY mytimestamp) >= 30 * 60
      THEN 1
      ELSE 0
    END as new_session
FROM
  toy_data_psql;

SELECT *
  , user_id || '_' || SUM(new_session)
  OVER (PARTITION BY user_id ORDER BY mytimestamp) AS session_id
FROM (
  SELECT *
    , CASE
       WHEN EXTRACT(EPOCH FROM mytimestamp)
          - LAG(EXTRACT(EPOCH FROM mytimestamp))
            OVER (PARTITION BY user_id ORDER BY mytimestamp) >= 30 * 60
       THEN 1
       ELSE 0
      END as new_session
    FROM
      toy_data_psql
) s1
import dataiku
import pandas as pd
from datetime import timedelta

# define treshold value
T = timedelta(seconds=30*60)

# load dataset
toy_data = dataiku.Dataset("toy_data").get_dataframe()

# add a column containing previous timestamp
toy_data =  pd.concat([toy_data,
                       toy_data.groupby('user_id').transform(lambda x:x.shift(1))]
                      ,axis=1)
toy_data.columns = ['user_id','mytimestamp','prev_mytimestamp']

# create the new session column
toy_data['new_session'] = ((toy_data['mytimestamp']
                            - toy_data['prev_mytimestamp'])>=T).astype(int)

# create the session_id
toy_data['increment'] = toy_data.groupby("user_id")['new_session'].cumsum()
toy_data['session_id'] = toy_data['user_id'].astype(str) + '_'
                                + toy_data['increment'].astype(str)

# to get the same result as with hive/postgresql
toy_data = toy_data.sort(['user_id','mytimestamp'])