Source code for gnt.score

"""score module. Functions to score genetic interactions"""
import pandas as pd
import statsmodels.api as sm
import numpy as np
import warnings


[docs]def check_guide_input(df):
    """Check whether input DataFrame has expected format

    - Column 1: first guide identifier
    - Column 2: second guide identifier
    - Column 3: first gene identifier
    - Column 4: second gene identifier
    - Column 5+: LFC values from different conditions

    Parameters
    ----------
    df: DataFrame
        LFC data with the specified columns
    """
    if df.shape[1] < 5:
        raise ValueError('Input has ' + str(df.shape[1]) + ' columns, should be > 4')
    if df.shape[0] == 0:
        raise ValueError('Input has no rows')
    if df.iloc[:, [0, 1]].drop_duplicates().shape[0] != df.shape[0]:
        raise ValueError('The first two columns of input (guide 1 and guide 2) should uniquely identify each row')


[docs]def melt_df(df, id_cols=None, var_name='condition', value_name='lfc'):
    """Helper function to melt a DataFrame

    Parameters
    ----------
    df: DataFrame
        log fold chang dataframe
    id_cols: list, optional
        Specify id columns. If none, then the first four columns are used
    var_name: str, optional
        New variable name
    value_name: str, optional
        New value name.

    Returns
    -------
    DataFrame
    """
    if id_cols is None:
        id_cols = df.columns[0:4]  # guide1, guide2, gene1, gene2
    melted_df = df.melt(id_vars=id_cols, var_name=var_name, value_name=value_name)
    return melted_df


[docs]def order_cols(df, cols, name):
    """Reorder values in columns to be in alphabetical order

    Parameters
    ----------
    df: DataFrame
        DataFrame with at least two columns to be reodredered in alphabetical order
    cols: List
        Indices of columns to be reordered
    name: str
        Name of reordered column

    Returns
    -------
    DataFrame
        With columns in alphabetical order
    """
    col1 = df.columns[cols[0]]
    col2 = df.columns[cols[1]]
    two_col_df = df[[col1, col2]].drop_duplicates()
    two_col_df[name + '_a'] = two_col_df.apply(lambda row: (row[col1] if row[col1] <= row[col2]
                                                            else row[col2]), axis=1)
    two_col_df[name + '_b'] = two_col_df.apply(lambda row: (row[col2] if row[col1] <= row[col2]
                                                            else row[col1]), axis=1)
    ordered_df = df.merge(two_col_df, how='inner', on=[col1, col2])
    return ordered_df


[docs]def order_cols_with_meta(df, cols, meta_cols, col_name, meta_name):
    """Reorder values in columns to be in alphabetical order, keeping track of columns with
    meta-information

    Parameters
    ----------
    df: DataFrame
        DataFrame with at least two columns to be reordered in alphabetical order
    cols: List
        Indices of columns to be reordered
    meta_cols: List
        Indices of columns with meta information, ordered the same as cols
    col_name: str
        Base name of reordered columns
    meta_name: str
        Base name of meta information columns

    Returns
    -------
    DataFrame
        With columns in alphabetical order and their respective meta info
    """
    col1 = df.columns[cols[0]]
    meta_col1 = df.columns[meta_cols[0]]
    col2 = df.columns[cols[1]]
    meta_col2 = df.columns[meta_cols[1]]
    four_col_df = df[[col1, col2, meta_col1, meta_col2]].drop_duplicates()
    four_col_df[col_name + '_a'] = four_col_df.apply(lambda row: (row[col1] if row[col1] <= row[col2]
                                                                  else row[col2]), axis=1)
    four_col_df[col_name + '_a_' + meta_name] = four_col_df.apply(lambda row: (row[meta_col1] if row[col1] <= row[col2]
                                                                  else row[meta_col2]), axis=1)
    four_col_df[col_name + '_b'] = four_col_df.apply(lambda row: (row[col2] if row[col1] <= row[col2]
                                                                  else row[col1]), axis=1)
    four_col_df[col_name + '_b_' + meta_name] = four_col_df.apply(lambda row: (row[meta_col2] if row[col1] <= row[col2]
                                                                  else row[meta_col1]), axis=1)
    ordered_df = df.merge(four_col_df, how='inner', on=[col1, meta_col1, col2, meta_col2])
    return ordered_df


[docs]def aggregate_guide_lfcs(df):
    agg_lfcs = (df.groupby(['guide_a', 'guide_b', 'guide_a_gene', 'guide_b_gene', 'condition'])
                .agg({'lfc': 'mean'})
                .reset_index())
    return agg_lfcs


[docs]def build_anchor_df(df):
    """Build a dataframe where each guide is defined as an anchor and is paired with all other target guides

    Parameters
    ----------
    df: DataFrame
        LFC df

    Returns
    -------
    DataFrame
    """
    df_columns = df.columns
    forward_df = (df
                  .rename({df_columns[0]: 'anchor_guide', df_columns[1]: 'target_guide',
                           df_columns[2]: 'anchor_gene', df_columns[3]: 'target_gene'}, axis=1))
    reverse_df = (df
                  .rename({df_columns[1]: 'anchor_guide', df_columns[0]: 'target_guide',
                           df_columns[3]: 'anchor_gene', df_columns[2]: 'target_gene'}, axis=1))
    anchor_df = (pd.concat([forward_df, reverse_df])
                 .drop_duplicates()  # in case where guide1==guide2
                 .reset_index(drop=True))
    return anchor_df


[docs]def get_base_score(df, ctl_genes):
    """Get LFC of each guide when paired with controls

    Parameters
    ----------
    df: DataFrame
        Anchor guides paired with all of their target guides. Has columns "anchor_guide" and "lfc"
    ctl_genes: list
        Control genes

    Returns
    -------
    DataFrame
        Median LFC for each anchor guide
    """
    base_score = (df[df.target_gene.isin(ctl_genes)]
                  .groupby(['anchor_guide', 'condition'])
                  .agg({'lfc': 'median'})
                  .reset_index())
    return base_score


[docs]def get_no_control_guides(df, guide_base_score):
    """Get guides that are not paired with controls

    Parameters
    ----------
    df: DataFrame
        DataFrame with the column anchor_gene
    guide_base_score: DataFrame
        Guide scores when paired with controls

    Returns
    -------
    list
        Guides without control pairs
    """
    starting_guides = set(df.anchor_guide.to_list())
    base_score_guides = set(guide_base_score.anchor_guide.to_list())
    no_control_guides = list(starting_guides - base_score_guides)
    return no_control_guides


[docs]def get_removed_guides_genes(anchor_df, guides):
    """Get dataframe of removed guides and genes

    Parameters
    ----------
    anchor_df: DataFrame
    guides: list
        List of guides being removed

    Returns
    -------
    DataFrame
    """
    remove_anchor = (anchor_df.loc[anchor_df.anchor_guide.isin(guides),
                                   ['anchor_guide', 'anchor_gene']]
                     .rename({'anchor_guide': 'guide', 'anchor_gene': 'gene'}, axis=1)
                     .drop_duplicates())
    return remove_anchor


[docs]def warn_no_control_guides(anchor_df, no_control_guides):
    """Give warning for guides with control pairs

    Parameters
    ----------
    anchor_df: DataFrame
    no_control_guides: list

    Warnings
    --------
    Guides without control pairs
    """
    if len(no_control_guides) > 0:
        removed_genes = get_removed_guides_genes(anchor_df, no_control_guides)
        warnings.warn('There are ' + str(len(no_control_guides)) + ' guides without control pairs:\n' +
                      str(removed_genes), stacklevel=2)


[docs]def join_anchor_base_score(anchor_df, base_df):
    """Join anchor DataFrame with Base LFCs on anchor_guide

    Parameters
    ----------
    anchor_df: DataFrame
        DataFrame with anchor and target guides
    base_df: DataFrame
        Base LFC DataFrame

    Returns
    -------
    DataFrame
    """
    joined_df = (anchor_df.merge(base_df, how='inner', left_on=['target_guide', 'condition'],
                                 right_on=['anchor_guide', 'condition'], suffixes=['', '_target'])
                 .drop('anchor_guide_target', axis=1))
    return joined_df


[docs]def check_min_guide_pairs(df, min_pairs):
    """Check that each guide is paired with a minimum number of guides

    Parameters
    ----------
    df: DataFrame
        Anchor df with column anchor_guide
    min_pairs: int
        minimum number of guides to be paired with

    Returns
    -------
    List
        guides that are not paired fewer than min_pairs number of guides
    """
    pair_count = (df[['anchor_guide', 'target_guide']]
                  .drop_duplicates()
                  .groupby('anchor_guide')
                  .apply(lambda d: d.shape[0])
                  .reset_index(name='n'))
    guides_no_pairs = pair_count.anchor_guide[~(pair_count.n >= min_pairs)].to_list()
    return guides_no_pairs


[docs]def remove_guides(df, rm_guides):
    """Remove guides from DataFrame

    Parameters
    ----------
    df: DataFrame
        Dataframe with columns for guides
    rm_guides: list
        List of guides to remove

    Returns
    -------
    DataFrame
        Filtered to remove guides
    """
    df = df[~(df.anchor_guide.isin(rm_guides))]
    return df


[docs]def filter_anchor_base_scores(df, min_pairs):
    """Filter guides that are not in with a certain number of pairs

    Parameters
    ----------
    df: DataFrame
        DataFrame with column anchor_guide
    min_pairs: int
        Minimum number of pairs for an anchor_guide

    Returns
    -------
    DataFrame
        Filtered dataframe if there are guides without the minimum number of guide pairs
    """
    guides_no_pairs = check_min_guide_pairs(df, min_pairs)
    if len(guides_no_pairs) > 0:
        removed_genes = get_removed_guides_genes(df, guides_no_pairs)
        warnings.warn('Removed ' + str(len(guides_no_pairs)) + ' guides with fewer than ' +
                      str(min_pairs) + ' pairs:\n' + str(removed_genes), stacklevel=2)
        filtered_anchor_base_scores = remove_guides(df, guides_no_pairs)
        return filtered_anchor_base_scores
    return df


[docs]def model_fixed_slope(train_x, train_y, test_x, slope=1):
    """Predict guide phenotype using fixed slope

    From: https://stackoverflow.com/questions/33292969/linear-regression-with-specified-slope

    Parameters
    ----------
    train_x: Series
        Single gene phenotype for training genes
    train_y: Series
        Pair phenotype for training genes
    test_x: Series
        Single gene phenotype for testing genes
    slope: int
        Slope to fit model

    Returns
    -------
    Series
        Predicted phenotype of pair
    DataFrame
        Information about model
    """
    intercept = np.mean(train_y - train_x*slope)
    model_info = {'model': 'fixed_slope', 'const': intercept}
    predictions = test_x*slope + intercept
    return predictions, model_info


[docs]def model_spline(train_x, train_y, test_x, deg):
    """Predict guide phenotype using a natural cubic spline

    Parameters
    ----------
    train_x: Series
        Single gene phenotype for training genes
    train_y: Series
        Pair phenotype for training genes
    test_x: Series
        Single gene phenotype for testing genes
    deg: int
        Degrees of freedom for spline model

    Returns
    -------
    Series
        Predicted phenotype of pair
    DataFrame
        Information about GLM model
    """
    train_x = train_x.rename('x', axis=1)
    train_y = train_y.rename('y', axis=1)
    train_df = pd.concat([train_x, train_y], axis=1)
    model_fit = sm.formula.ols("y ~ cr(x, df=" + str(deg) + ", constraints='center')", data=train_df).fit()
    model_info = {'model': 'spline', 'const': model_fit.params.xs('Intercept')}
    test_x = test_x.rename('x')
    predictions = model_fit.predict(test_x)
    return predictions, model_info


[docs]def model_linear(train_x, train_y, test_x):
    """Predict guide phenotype using a linear model and ordinary least squares

    Parameters
    ----------
    train_x: Series
        Single gene phenotype for training genes
    train_y: Series
        Pair phenotype for training genes
    test_x: Series
        Single gene phenotype for testing genes

    Returns
    -------
    Series
        Predicted phenotype of pair
    DataFrame
        Information about OLS model
    """
    train_x = sm.add_constant(train_x)
    model_fit = sm.OLS(train_y, train_x).fit()
    model_info = {'model': 'linear', 'R2': model_fit.rsquared, 'f_pvalue': model_fit.f_pvalue,
                  'const': model_fit.params.const, 'beta': model_fit.params.values[1]}
    predictions = model_fit.predict(sm.add_constant(test_x))
    return predictions, model_info


[docs]def model_quadratic(train_x, train_y, test_x):
    """Predict guide phenotype using a linear model and ordinary least squares

    Parameters
    ----------
    train_x: Series
        Single gene phenotype for training genes
    train_y: Series
        Pair phenotype for training genes
    test_x: Series
        Single gene phenotype for testing genes

    Returns
    -------
    Series
        Predicted phenotype of pair
    DataFrame
        Information about OLS model
    """
    train_x = train_x.rename('x', axis=1)
    train_x = sm.add_constant(train_x)
    train_df = train_x.copy()
    train_df['y'] = train_y
    model_fit = sm.formula.ols('y ~ np.power(x, 2) + x + const', data=train_df).fit()
    model_info = {'model': 'quadratic', 'R2': model_fit.rsquared, 'f_pvalue': model_fit.f_pvalue,
                  'const': model_fit.params.const}
    test_x = test_x.rename('x')
    test_x = sm.add_constant(test_x)
    predictions = model_fit.predict(test_x)
    return predictions, model_info


[docs]def fit_anchor_model(df, fit_genes, model, deg, x_col='lfc_target', y_col='lfc'):
    """Fit linear model for a single anchor guide paired with all target guides in a condition

    Parameters
    ----------
    df: DataFrame
        LFCs for a single anchor anchor guide
    fit_genes: list
        Genes used to fit the linear model
    model: str
        Name of model used to fit x and y data
    x_col: str, optional
        X column to fit model
    y_col: str, optional
        Y column to fit model
    deg: int
        Degrees of freedom for spline model

    Returns
    -------
    DataFrame:
        Guide level residuals
    DataFrame:
        R^2 for model
    """
    if fit_genes is not None:
        train_df = df.loc[df.target_gene.isin(fit_genes), :].copy()
    else:
        train_df = df
    train_x = train_df[x_col].copy()
    train_y = train_df[y_col].copy()
    test_x = df[x_col].copy()
    test_y = df[y_col].copy()
    if model == 'linear':
        predictions, model_info = model_linear(train_x, train_y, test_x)
    elif model == 'fixed slope':
        predictions, model_info = model_fixed_slope(train_x, train_y, test_x)
    elif model == 'spline':
        predictions, model_info = model_spline(train_x, train_y, test_x, deg)
    elif model == 'quadratic':
        predictions, model_info = model_quadratic(train_x, train_y, test_x)
    else:
        raise ValueError('Model ' + model + ' not implemented')
    out_df = df.copy()
    out_df['prediction'] = predictions
    out_df['residual'] = test_y - predictions
    out_df['residual_z'] = (out_df['residual'] - out_df['residual'].mean())/out_df['residual'].std()
    return out_df, model_info


[docs]def fit_models(df, fit_genes, model, deg):
    """Loop through anchor guides and fit a linear model

    Parameters
    ----------
    df: DataFrame
        LFCs for all anchor guides
    fit_genes: list
        Genes used to fit the linear model
    model: str
        Name of model used to fit x and y data
    deg: int
        Degrees of freedom for spline model

    Returns
    -------
    DataFrame:
        Guide level residuals
    DataFrame:
        R^2 for each model
    """
    model_info_list = []
    residual_list = []
    for guide_condition, group_df in df.groupby(['anchor_guide', 'condition']):
        residuals, model_info = fit_anchor_model(group_df, fit_genes, model, deg)
        residual_list.append(residuals)
        model_info['anchor_guide'] = guide_condition[0]
        model_info['anchor_gene'] = group_df['anchor_gene'].values[0]
        model_info['condition'] = guide_condition[1]
        model_info_list.append(model_info)
    model_info_df = pd.DataFrame(model_info_list)
    residual_df = (pd.concat(residual_list, axis=0)
                   .reset_index(drop=True))
    return residual_df, model_info_df


[docs]def get_guide_residuals(lfc_df, ctl_genes, fit_genes=None,
                        min_pairs=5, model='linear', deg=6):
    """Calculate guide-level residuals

    Parameters
    ----------
    lfc_df: DataFrame
        LFCs from combinatorial screen
        * Column 1 - first guide identifier
        * Column 2 - second guide identifier
        * Column 3 - first gene identifier
        * Column 4 - second gene identifier
        * Column 5+ - LFC values from different conditions
    ctl_genes: list
        Negative control genes (e.g. nonessential, intronic, or no site)
    fit_genes: list, optional
        Genes used to train each linear model. If None, uses all genes to fit. This can be helpful if we expect
        a large fraction of target_genes to be interactors
    min_pairs: int, optional
        Minimum number of pairs a guide must be in
    model: str, optional
        Name of model to fit to data
    deg: int, optional
        Degrees of freedom for spline model. Ignored if model is not spline

    Returns
    -------
    DataFrame
        Guide level residuals
    DataFrame
        R-squared and f-statistic p-value for each linear model
    """
    check_guide_input(lfc_df)
    melted_lfc_df = melt_df(lfc_df)
    reordered_guides = order_cols_with_meta(melted_lfc_df, [0, 1], [2, 3], 'guide', 'gene')
    dedup_guide_lfcs = aggregate_guide_lfcs(reordered_guides)
    melted_anchor_df = build_anchor_df(dedup_guide_lfcs)
    guide_base_score = get_base_score(melted_anchor_df, ctl_genes)
    no_control_guides = get_no_control_guides(melted_anchor_df, guide_base_score)
    warn_no_control_guides(melted_anchor_df, no_control_guides)
    anchor_base_scores = join_anchor_base_score(melted_anchor_df, guide_base_score)
    filtered_anchor_base_scores = filter_anchor_base_scores(anchor_base_scores, min_pairs)
    guide_residuals, model_info_df = fit_models(filtered_anchor_base_scores, fit_genes, model, deg)
    return guide_residuals, model_info_df


[docs]def get_pop_stats(df, stat):
    """Get mean and standard deviation for a stat

    Parameters
    ----------
    df: DataFrame
        Guide level scores
    stat: str
        Column to caclulate statistics from

    Returns
    -------
    DataFrame
        Mean and standard deviation of the specified stat
    """
    pop_stats = (df.groupby('condition')
                 .agg(mean_stat=(stat, 'mean'),
                      std_stat=(stat, 'std'))
                 .reset_index())
    return pop_stats


[docs]def combine_z_scores(df, stat):
    """Combine z-score for a gene pair

    Sum z-scores and divide by the square root of the number of observations

    Parameters
    ----------
    df: DataFrame
        guide level statistic
    stat: str
        statistic to combine

    Returns
    -------
    DataFrame
        z_score for gene combination
    """
    guide1 = df.columns[0]
    guide2 = df.columns[1]
    combo_stats = (df.groupby(['condition', 'gene_a', 'gene_b'])
                   .agg(sum_stat=(stat, 'sum'))
                   .reset_index())
    guide_pair_df = (df.groupby(['condition', 'gene_a', 'gene_b'])
                     .apply(lambda d: len({frozenset(x) for x in zip(d[guide1], d[guide2])}))  # use sets to count
                     # # unique guide pairs
                     .reset_index(name='guide_pairs'))
    combo_stats = combo_stats.merge(guide_pair_df, how='inner', on=['condition', 'gene_a', 'gene_b'])
    combo_stats['pair_z_score'] = (combo_stats['sum_stat'] / np.sqrt(combo_stats['guide_pairs']))
    return combo_stats[['condition', 'gene_a', 'gene_b', 'guide_pairs', 'pair_z_score']]


[docs]def get_avg_score(df, score):
    """Get avg score for gene pairs

    Parameters
    ----------
    df: DataFrame
        Guide-level DataFrame
    score: str
        Column to average

    Returns
    -------
    DataFrame
        Average score for gene pairs
    """
    avg_score = (df.groupby(['condition', 'gene_a', 'gene_b'])
                 .agg({score: 'mean'})
                 .reset_index())
    return avg_score


[docs]def get_base_lfc_from_resid(residual_df):
    """Calculate gene level base lfcs from the guide-level residual dataframe

    Parameters
    ----------
    residual_df: DataFrame
        DataFrame of residuals

    Returns
    -------
    DataFrame
        With columns gene, condition, base_lfc
    """
    gene_base_lfcs = (residual_df[['target_gene', 'condition', 'lfc_target']]
                      .drop_duplicates()
                      .groupby(['target_gene', 'condition'])
                      .agg({'lfc_target': 'mean'})
                      .reset_index()
                      .rename({'lfc_target': 'base_lfc', 'target_gene': 'gene'}, axis=1))
    return gene_base_lfcs


[docs]def get_gene_residuals(guide_residuals, stat='residual_z'):
    """Combine residuals at the gene level

    Parameters
    ----------
    guide_residuals: DataFrame
        Guide level residuals
    stat: str, optional
        Statistic to combine at the gene level

    Returns
    -------
    DataFrame
        Gene level z_scores
    """
    ordered_df = order_cols(guide_residuals, [2, 3], 'gene')
    gene_a_anchor_z = combine_z_scores(ordered_df[ordered_df.gene_a == ordered_df.anchor_gene], stat)
    gene_b_anchor_z = combine_z_scores(ordered_df[ordered_df.gene_b == ordered_df.anchor_gene], stat)
    avg_lfc = get_avg_score(ordered_df, 'lfc')
    base_lfcs = get_base_lfc_from_resid(guide_residuals)
    gene_results = (avg_lfc.merge(base_lfcs, how='inner', left_on=['condition', 'gene_a'],
                                  right_on=['condition', 'gene'])
                    .drop('gene', axis=1)
                    .merge(base_lfcs, how='inner', left_on=['condition', 'gene_b'],
                           right_on=['condition', 'gene'], suffixes=['_a', '_b'])
                    .drop('gene', axis=1)
                    .merge(gene_a_anchor_z, how='inner',
                           on=['condition', 'gene_a', 'gene_b'])
                    .merge(gene_b_anchor_z, how='inner',
                           on=['condition', 'gene_a', 'gene_b', 'guide_pairs'], suffixes=['_gene_a_anchor',
                                                                                          '_gene_b_anchor']))
    gene_results['pair_z_score'] = (gene_results[['pair_z_score_gene_a_anchor', 'pair_z_score_gene_b_anchor']]
                                    .sum(axis=1) / np.sqrt(2))
    return gene_results


[docs]def calc_dlfc(df, base_lfcs):
    """Add together base lfcs to generate an expectation for each guide pair

    Parameters
    ----------
    df: DataFrame
        Combo level LFCs
    base_lfcs: DataFrame
        Base LFCs - single guide phenotype

    Returns
    -------
    DataFrame
        delta LFCs for each guide pair
    """
    guide1 = df.columns[0]
    guide2 = df.columns[1]
    dlfc = (df.merge(base_lfcs, how='inner', left_on=[guide1, 'condition'],
                     right_on=['anchor_guide', 'condition'], suffixes=['', '_' + guide1 + '_base'])
            .drop('anchor_guide', axis=1)
            .merge(base_lfcs, how='inner', left_on=[guide2, 'condition'],
                   right_on=['anchor_guide', 'condition'], suffixes=['', '_' + guide2 + '_base'])
            .drop('anchor_guide', axis=1))
    dlfc['sum_lfc'] = dlfc['lfc_' + guide1 + '_base'] + dlfc['lfc_' + guide2 + '_base']
    dlfc['dlfc'] = dlfc['lfc'] - dlfc['sum_lfc']
    dlfc['dlfc_z'] = (dlfc.groupby('condition')
                      .dlfc
                      .transform(lambda x: (x - x.mean())/x.std()))
    return dlfc


[docs]def get_guide_dlfcs(lfc_df, ctl_genes):
    """Calculate delta-LFC's at the guide level

    Model the LFC of each combination as the sum of each guide when paired with controls. The difference from this
    expectation is the delta log2-fold change

    Parameters
    ----------
    lfc_df: DataFrame
        LFCs from combinatorial screen

        - Column 1: first guide identifier
        - Column 2: second guide identifier
        - Column 3: first gene identifier
        - Column 4: second gene identifier
        - Column 5+: LFC values from different conditions
    ctl_genes: list
        Negative control genes (e.g. nonessential, intronic, or no site)

    Returns
    -------
    DataFrame:
        delta LFCs for guide pairs
    """
    check_guide_input(lfc_df)
    #  get base lfcs using anchor framework
    melted_lfc_df = melt_df(lfc_df)
    reordered_guides = order_cols_with_meta(melted_lfc_df, [0, 1], [2, 3], 'guide', 'gene')
    dedup_guide_lfcs = aggregate_guide_lfcs(reordered_guides)
    melted_anchor_df = build_anchor_df(dedup_guide_lfcs)
    guide_base_score = get_base_score(melted_anchor_df, ctl_genes)
    no_control_guides = get_no_control_guides(melted_anchor_df, guide_base_score)
    #  calculate delta-log-fold changes
    warn_no_control_guides(melted_anchor_df, no_control_guides)
    melted_df = melt_df(lfc_df)
    guide_dlfc = calc_dlfc(melted_df, guide_base_score)
    return guide_dlfc


[docs]def get_base_lfc_from_dlfc(dlfc_df):
    """Calculate gene level base lfcs from the guide-level dLFC dataframe

    Parameters
    ----------
    dlfc_df: DataFrame
        DataFrame of delta log-fold changes

    Returns
    -------
    DataFrame
        With columns gene, condition, base_lfc
    """
    gene1 = dlfc_df.columns[2]
    gene2 = dlfc_df.columns[3]
    guide1_base = 'lfc_' + dlfc_df.columns[0] + '_base'
    guide2_base = 'lfc_' + dlfc_df.columns[1] + '_base'
    base_lfcs = (pd.concat([dlfc_df.rename({gene1: 'gene', guide1_base: 'base_lfc'}, axis=1),
                            dlfc_df.rename({gene2: 'gene', guide2_base: 'base_lfc'}, axis=1)])
                 .groupby(['condition', 'gene'])
                 .agg({'base_lfc': 'mean'})
                 .reset_index())
    return base_lfcs


[docs]def get_gene_dlfcs(guide_dlfcs, stat='dlfc_z'):
    """Combine dLFCs at the gene level

    Parameters
    ----------
    guide_dlfcs: DataFrame
        Guide level dLFCs
    stat: str, optional
        Statistic to combine at the gene level

    Returns
    -------
    DataFrame
        Gene level z_scores
    """
    ordered_df = order_cols(guide_dlfcs, [2, 3], 'gene')
    combined_z = combine_z_scores(ordered_df, stat)
    avg_lfc = get_avg_score(ordered_df, 'lfc')
    base_lfcs = get_base_lfc_from_dlfc(guide_dlfcs)
    gene_results = (avg_lfc.merge(base_lfcs, how='inner', left_on=['condition', 'gene_a'],
                                  right_on=['condition', 'gene'])
                    .drop('gene', axis=1)
                    .merge(base_lfcs, how='inner', left_on=['condition', 'gene_b'],
                           right_on=['condition', 'gene'], suffixes=['_a', '_b'])
                    .drop('gene', axis=1)
                    .merge(combined_z, how='inner', on=['condition', 'gene_a', 'gene_b']))
    return gene_results