import numpy as np
import statsmodels.api as sm

import pandas as pd

url = "https://stats.idre.ucla.edu/stat/data/hsb2.csv"
hsb2 = pd.read_table(url, delimiter=",")

hsb2.head(10)

hsb2.groupby("race")["write"].mean()

race
1    46.458333
2    58.000000
3    48.200000
4    54.055172
Name: write, dtype: float64

from patsy.contrasts import Treatment

levels = [1, 2, 3, 4]
contrast = Treatment(reference=0).code_without_intercept(levels)
print(contrast.matrix)

[[0. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]

hsb2.race.head(10)

0    4
1    4
2    4
3    4
4    4
5    4
6    3
7    1
8    4
9    3
Name: race, dtype: int64

print(contrast.matrix[hsb2.race - 1, :][:20])

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]

pd.get_dummies(hsb2.race.values, drop_first=False)

from statsmodels.formula.api import ols

mod = ols("write ~ C(race, Treatment)", data=hsb2)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  write   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     7.833
Date:                Wed, 28 Aug 2024   Prob (F-statistic):           5.78e-05
Time:                        10:25:31   Log-Likelihood:                -721.77
No. Observations:                 200   AIC:                             1452.
Df Residuals:                     196   BIC:                             1465.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
===========================================================================================
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                  46.4583      1.842     25.218      0.000      42.825      50.091
C(race, Treatment)[T.2]    11.5417      3.286      3.512      0.001       5.061      18.022
C(race, Treatment)[T.3]     1.7417      2.732      0.637      0.525      -3.647       7.131
C(race, Treatment)[T.4]     7.5968      1.989      3.820      0.000       3.675      11.519
==============================================================================
Omnibus:                       10.487   Durbin-Watson:                   1.779
Prob(Omnibus):                  0.005   Jarque-Bera (JB):               11.031
Skew:                          -0.551   Prob(JB):                      0.00402
Kurtosis:                       2.670   Cond. No.                         8.25
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

from patsy.contrasts import ContrastMatrix


def _name_levels(prefix, levels):
    return ["[%s%s]" % (prefix, level) for level in levels]


class Simple(object):
    def _simple_contrast(self, levels):
        nlevels = len(levels)
        contr = -1.0 / nlevels * np.ones((nlevels, nlevels - 1))
        contr[1:][np.diag_indices(nlevels - 1)] = (nlevels - 1.0) / nlevels
        return contr

    def code_with_intercept(self, levels):
        contrast = np.column_stack(
            (np.ones(len(levels)), self._simple_contrast(levels))
        )
        return ContrastMatrix(contrast, _name_levels("Simp.", levels))

    def code_without_intercept(self, levels):
        contrast = self._simple_contrast(levels)
        return ContrastMatrix(contrast, _name_levels("Simp.", levels[:-1]))

hsb2.groupby("race")["write"].mean().mean()

51.67837643678162

contrast = Simple().code_without_intercept(levels)
print(contrast.matrix)

[[-0.25 -0.25 -0.25]
 [ 0.75 -0.25 -0.25]
 [-0.25  0.75 -0.25]
 [-0.25 -0.25  0.75]]

mod = ols("write ~ C(race, Simple)", data=hsb2)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  write   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     7.833
Date:                Wed, 28 Aug 2024   Prob (F-statistic):           5.78e-05
Time:                        10:25:31   Log-Likelihood:                -721.77
No. Observations:                 200   AIC:                             1452.
Df Residuals:                     196   BIC:                             1465.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
===========================================================================================
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                  51.6784      0.982     52.619      0.000      49.741      53.615
C(race, Simple)[Simp.1]    11.5417      3.286      3.512      0.001       5.061      18.022
C(race, Simple)[Simp.2]     1.7417      2.732      0.637      0.525      -3.647       7.131
C(race, Simple)[Simp.3]     7.5968      1.989      3.820      0.000       3.675      11.519
==============================================================================
Omnibus:                       10.487   Durbin-Watson:                   1.779
Prob(Omnibus):                  0.005   Jarque-Bera (JB):               11.031
Skew:                          -0.551   Prob(JB):                      0.00402
Kurtosis:                       2.670   Cond. No.                         7.03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

from patsy.contrasts import Sum

contrast = Sum().code_without_intercept(levels)
print(contrast.matrix)

[[ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]
 [-1. -1. -1.]]

mod = ols("write ~ C(race, Sum)", data=hsb2)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  write   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     7.833
Date:                Wed, 28 Aug 2024   Prob (F-statistic):           5.78e-05
Time:                        10:25:31   Log-Likelihood:                -721.77
No. Observations:                 200   AIC:                             1452.
Df Residuals:                     196   BIC:                             1465.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
=====================================================================================
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            51.6784      0.982     52.619      0.000      49.741      53.615
C(race, Sum)[S.1]    -5.2200      1.631     -3.200      0.002      -8.437      -2.003
C(race, Sum)[S.2]     6.3216      2.160      2.926      0.004       2.061      10.582
C(race, Sum)[S.3]    -3.4784      1.732     -2.008      0.046      -6.895      -0.062
==============================================================================
Omnibus:                       10.487   Durbin-Watson:                   1.779
Prob(Omnibus):                  0.005   Jarque-Bera (JB):               11.031
Skew:                          -0.551   Prob(JB):                      0.00402
Kurtosis:                       2.670   Cond. No.                         6.72
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

hsb2.groupby("race")["write"].mean().mean()

51.67837643678162

from patsy.contrasts import Diff

contrast = Diff().code_without_intercept(levels)
print(contrast.matrix)

[[-0.75 -0.5  -0.25]
 [ 0.25 -0.5  -0.25]
 [ 0.25  0.5  -0.25]
 [ 0.25  0.5   0.75]]

mod = ols("write ~ C(race, Diff)", data=hsb2)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  write   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     7.833
Date:                Wed, 28 Aug 2024   Prob (F-statistic):           5.78e-05
Time:                        10:25:31   Log-Likelihood:                -721.77
No. Observations:                 200   AIC:                             1452.
Df Residuals:                     196   BIC:                             1465.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
======================================================================================
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept             51.6784      0.982     52.619      0.000      49.741      53.615
C(race, Diff)[D.1]    11.5417      3.286      3.512      0.001       5.061      18.022
C(race, Diff)[D.2]    -9.8000      3.388     -2.893      0.004     -16.481      -3.119
C(race, Diff)[D.3]     5.8552      2.153      2.720      0.007       1.610      10.101
==============================================================================
Omnibus:                       10.487   Durbin-Watson:                   1.779
Prob(Omnibus):                  0.005   Jarque-Bera (JB):               11.031
Skew:                          -0.551   Prob(JB):                      0.00402
Kurtosis:                       2.670   Cond. No.                         8.30
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

res.params["C(race, Diff)[D.1]"]
hsb2.groupby("race").mean()["write"][2] - hsb2.groupby("race").mean()["write"][1]

11.541666666666664

from patsy.contrasts import Helmert

contrast = Helmert().code_without_intercept(levels)
print(contrast.matrix)

[[-1. -1. -1.]
 [ 1. -1. -1.]
 [ 0.  2. -1.]
 [ 0.  0.  3.]]

mod = ols("write ~ C(race, Helmert)", data=hsb2)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  write   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     7.833
Date:                Wed, 28 Aug 2024   Prob (F-statistic):           5.78e-05
Time:                        10:25:31   Log-Likelihood:                -721.77
No. Observations:                 200   AIC:                             1452.
Df Residuals:                     196   BIC:                             1465.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                51.6784      0.982     52.619      0.000      49.741      53.615
C(race, Helmert)[H.2]     5.7708      1.643      3.512      0.001       2.530       9.011
C(race, Helmert)[H.3]    -1.3431      0.867     -1.548      0.123      -3.054       0.368
C(race, Helmert)[H.4]     0.7923      0.372      2.130      0.034       0.059       1.526
==============================================================================
Omnibus:                       10.487   Durbin-Watson:                   1.779
Prob(Omnibus):                  0.005   Jarque-Bera (JB):               11.031
Skew:                          -0.551   Prob(JB):                      0.00402
Kurtosis:                       2.670   Cond. No.                         7.26
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

grouped = hsb2.groupby("race")
grouped.mean()["write"].loc[4] - grouped.mean()["write"].loc[:3].mean()

3.169061302681982

grouped.mean()["write"]

race
1    46.458333
2    58.000000
3    48.200000
4    54.055172
Name: write, dtype: float64

k = 4
c4 = 1.0 / k * (grouped.mean()["write"].loc[k] - grouped.mean()["write"].loc[: k - 1].mean())
k = 3
c3 = 1.0 / k * (grouped.mean()["write"].loc[k] - grouped.mean()["write"].loc[: k - 1].mean())
print(c4, c3)

0.7922653256704955 -1.3430555555555561

hsb2["readcat"] = np.asarray(pd.cut(hsb2.read, bins=4))
hsb2["readcat"] = hsb2["readcat"].astype(object)
hsb2.groupby("readcat").mean()["write"]

readcat
(27.952, 40.0]    42.772727
(40.0, 52.0]      49.978495
(52.0, 64.0]      56.563636
(64.0, 76.0]      61.833333
Name: write, dtype: float64

from patsy.contrasts import Poly

levels = hsb2.readcat.unique()
contrast = Poly().code_without_intercept(levels)
print(contrast.matrix)

[[-0.67082039  0.5        -0.2236068 ]
 [-0.2236068  -0.5         0.67082039]
 [ 0.2236068  -0.5        -0.67082039]
 [ 0.67082039  0.5         0.2236068 ]]

mod = ols("write ~ C(readcat, Poly)", data=hsb2)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  write   R-squared:                       0.346
Model:                            OLS   Adj. R-squared:                  0.336
Method:                 Least Squares   F-statistic:                     34.51
Date:                Wed, 28 Aug 2024   Prob (F-statistic):           5.95e-18
Time:                        10:25:31   Log-Likelihood:                -690.69
No. Observations:                 200   AIC:                             1389.
Df Residuals:                     196   BIC:                             1403.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================================
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
Intercept                     52.7870      0.634     83.268      0.000      51.537      54.037
C(readcat, Poly).Linear       14.2587      1.484      9.607      0.000      11.332      17.186
C(readcat, Poly).Quadratic    -0.9680      1.268     -0.764      0.446      -3.468       1.532
C(readcat, Poly).Cubic        -0.1554      1.006     -0.154      0.877      -2.140       1.829
==============================================================================
Omnibus:                        4.467   Durbin-Watson:                   1.768
Prob(Omnibus):                  0.107   Jarque-Bera (JB):                4.289
Skew:                          -0.307   Prob(JB):                        0.117
Kurtosis:                       2.628   Cond. No.                         3.01
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

コントラストの概要¶

例題データ¶

処置（ダミー）符号化¶

シンプル符号化¶

合計（偏差）符号化¶

後方差分符号化¶

ヘルマート符号化¶

直交多項式符号化¶

	id	female	race	ses	schtyp	prog	read	write	math	science	socst
0	70	0	4	1	1	1	57	52	41	47	57
1	121	1	4	2	1	3	68	59	53	63	61
2	86	0	4	3	1	1	44	33	54	58	31
3	141	0	4	3	1	3	63	44	47	53	56
4	172	0	4	2	1	2	47	52	57	53	61
5	113	0	4	2	1	2	44	52	51	63	61
6	50	0	3	2	1	1	50	59	42	53	61
7	11	0	1	2	1	2	34	46	45	39	36
8	84	0	4	2	1	1	63	57	54	58	51
9	48	0	3	2	1	2	57	55	52	50	51

	1	2	3	4
0	False	False	False	True
1	False	False	False	True
2	False	False	False	True
3	False	False	False	True
4	False	False	False	True
...	...	...	...	...
195	False	True	False	False
196	False	False	False	True
197	False	False	False	True
198	False	False	False	True
199	False	False	False	True

	id	female	race	ses	schtyp	prog	read	write	math	science	socst
0	70	0	4	1	1	1	57	52	41	47	57
1	121	1	4	2	1	3	68	59	53	63	61
2	86	0	4	3	1	1	44	33	54	58	31
3	141	0	4	3	1	3	63	44	47	53	56
4	172	0	4	2	1	2	47	52	57	53	61
5	113	0	4	2	1	2	44	52	51	63	61
6	50	0	3	2	1	1	50	59	42	53	61
7	11	0	1	2	1	2	34	46	45	39	36
8	84	0	4	2	1	1	63	57	54	58	51
9	48	0	3	2	1	2	57	55	52	50	51

	id	female	race	ses	schtyp	prog	read	write	math	science	socst
0	70	0	4	1	1	1	57	52	41	47	57
1	121	1	4	2	1	3	68	59	53	63	61
2	86	0	4	3	1	1	44	33	54	58	31
3	141	0	4	3	1	3	63	44	47	53	56
4	172	0	4	2	1	2	47	52	57	53	61
5	113	0	4	2	1	2	44	52	51	63	61
6	50	0	3	2	1	1	50	59	42	53	61
7	11	0	1	2	1	2	34	46	45	39	36
8	84	0	4	2	1	1	63	57	54	58	51
9	48	0	3	2	1	2	57	55	52	50	51