{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# BMI prediction using the COMBO dataset\n\nWe first consider the [COMBO data set](https://github.com/Leo-Simpson/c-lasso/tree/master/examples/COMBO_data)\nand show how to predict Body Mass Index (BMI) from microbial genus abundances and two non-compositional covariates using \"filtered_data\".\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Import the package\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import sys, os\nfrom os.path import join, dirname, abspath\n\nclasso_dir = dirname(dirname(abspath(\"__file__\")))\nsys.path.append(classo_dir)\nfrom classo import classo_problem, clr\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Define how to read csv\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "def csv_to_np(file, begin=1, header=None):\n    \"\"\"Function to read a csv file and to create an ndarray with this\n\n    Args:\n        file (str): Name of csv file\n        begin (int, optional): First column where it should read the matrix\n        header (None or int, optional): Same parameter as in the function :func:`pandas.read_csv`\n\n    Returns:\n        ndarray : matrix of the csv file\n    \"\"\"\n    tab1 = pd.read_csv(file, header=header)\n    return np.array(tab1)[:, begin:]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Load microbiome and covariate data X\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "data_dir = join(classo_dir, \"examples/COMBO_data\")\nX0 = csv_to_np(join(data_dir, \"complete_data/GeneraCounts.csv\"), begin=0).astype(float)\nX_C = csv_to_np(join(data_dir, \"CaloriData.csv\"), begin=0).astype(float)\nX_F = csv_to_np(join(data_dir, \"FatData.csv\"), begin=0).astype(float)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Load BMI measurements y\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "y = csv_to_np(join(data_dir, \"BMI.csv\"), begin=0).astype(float)[:, 0]\nlabels = csv_to_np(join(data_dir, \"complete_data/GeneraPhylo.csv\")).astype(str)[:, -1]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Normalize/transform data\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "y = y - np.mean(y)  # BMI data (n = 96)\nX_C = X_C - np.mean(X_C, axis=0)  # Covariate data (Calorie)\nX_F = X_F - np.mean(X_F, axis=0)  # Covariate data (Fat)\nX0 = clr(X0, 1 / 2).T"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Set up design matrix and zero-sum constraints for 45 genera\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "X = np.concatenate(\n    (X0, X_C, X_F), axis=1\n)  # Joint microbiome and covariate data and offset\nlabel = np.concatenate([labels, np.array([\"Calorie\", \"Fat\"])])\nC = np.ones((1, len(X[0])))\nC[0, -1], C[0, -2] = 0.0, 0.0"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Set up c-lasso problem\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "problem = classo_problem(X, y, C, label=label)\nproblem.formulation.intercept = True"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Use stability selection with theoretical lambda [Combettes & M\u00fcller, 2020b]\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "problem.model_selection.StabSelparameters.method = \"lam\"\nproblem.model_selection.StabSelparameters.threshold_label = 0.5"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Use formulation R3\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "problem.formulation.concomitant = True\n\nproblem.solve()\nprint(problem)\nprint(problem.solution)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Use formulation R4\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "problem.data.label = label\nproblem.formulation.huber = True\nproblem.formulation.concomitant = True\n\nproblem.solve()\nprint(problem)\nprint(problem.solution)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Use formulation R1 with ALO\nALO is implemented only for R1 without intercept for now.\n\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "problem.data.label = label\nproblem.formulation.intercept = False\nproblem.formulation.huber = False\nproblem.formulation.concomitant = False\nproblem.model_selection.ALO = True\n\nproblem.solve()\nprint(problem)\nprint(problem.solution)"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.1"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}