In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import matplotlib as mpl
mpl.rcParams['axes.titlesize'] = 20
mpl.rcParams['axes.labelcolor'] = 'black'
mpl.rcParams['axes.labelsize'] = 20
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['lines.markersize'] = 10
mpl.rcParams['xtick.labelsize'] = 16
mpl.rcParams['ytick.labelsize'] = 16

# customize IPython's style
from IPython.core.display import HTML
def css_styling():
    styles = open("custom.css", "r").read() #or edit path to custom.css
    return HTML(styles)
css_styling()
Out[1]:

ROOT and NumPy

An Introduction with Examples

Noel Dawe

Introduction

  • Why NumPy? numpy.org

  • NumPy is the fundamental package for scientific computing with Python. It contains among other things:

    • a powerful N-dimensional array object
    • sophisticated (broadcasting) functions
    • tools for integrating C/C++ and Fortran code
    • useful linear algebra, Fourier transform, and random number capabilities

But how does this help me if all of my data is in ROOT format?

In [40]:
import numpy as np

arr = np.array([1, 2, 3, 3])
arr.mean(), arr.std(), arr.min(), arr.max(), arr.sum()
Out[40]:
(2.25, 0.82915619758884995, 1, 3, 9)
In [3]:
# Unique elements in an array
np.unique(arr)
Out[3]:
array([1, 2, 3])
In [4]:
# Invert a matrix
np.linalg.inv([[3, 1], 
               [2, 8]])
Out[4]:
array([[ 0.36363636, -0.04545455],
       [-0.09090909,  0.13636364]])
In [5]:
# Sample a multivariate normal
np.random.multivariate_normal(np.ones(2), np.diag(np.ones(2)), 5)
Out[5]:
array([[ 1.23472433,  2.04601652],
       [ 2.52191207,  2.23898658],
       [-0.19101278, -0.29345934],
       [ 2.40219927,  1.74374308],
       [ 0.99379826,  1.55826936]])

root_numpy: ROOT + NumPy

  • A Python extension module
  • Internals are compiled C++ and is faster than pure Python
  • Convert between TTrees and NumPy arrays, fill histograms with NumPy arrays, random sample ROOT functions, etc.
In [6]:
import ROOT
from root_numpy import root2array
from root_numpy.testdata import get_filepath

filename = get_filepath('single1.root')

# Convert a TTree in a ROOT file into a NumPy structured array
arr = root2array(filename, 'tree')
# The TTree name is always optional if there is only one TTree in the file

arr[:5]
Out[6]:
array([(1, 1.0, 1.0), (2, 3.0, 4.0), (3, 5.0, 7.0), (4, 7.0, 10.0),
       (5, 9.0, 13.0)], 
      dtype=[('n_int', '<i4'), ('f_float', '<f4'), ('d_double', '<f8')])
In [7]:
from root_numpy import tree2rec

# Get the TTree from the ROOT file
rfile = ROOT.TFile(get_filepath('test.root'))
intree = rfile.Get('tree')

# Convert to an array (with expressions, selection, etc.)
rec = tree2rec(intree,
    branches=['x', 'y', 'sqrt(y)', 'TMath::Landau(x)', 'cos(x)*sin(y)'],
    selection='z > 0')
In [8]:
from root_numpy import array2tree, array2root

# Rename the fields (branches)
rec.dtype.names = ('x', 'y', 'sqrt_y', 'landau_x', 'cos_x_sin_y')

# Convert the NumPy record array into a TTree
tree = array2tree(rec, name='tree')
print(tree.GetEntries())
print([b.GetName() for b in tree.GetListOfBranches()])

# Dump directly into a ROOT file without touching PyROOT
array2root(rec, 'selected_tree.root', 'tree')
56
['x', 'y', 'sqrt_y', 'landau_x', 'cos_x_sin_y']
In [39]:
import numpy as np
from rootpy.plotting import Hist2D, Canvas
from rootpy.plotting.style import set_style; set_style('ATLAS')
from root_numpy import fill_hist

# Fill a ROOT histogram from a NumPy array
hist = Hist2D(20, -3, 3, 20, -3, 3, drawstyle='LEGO2')
fill_hist(hist, np.random.randn(1E6, 2))
hist
INFO:rootpy.plotting.style:using ROOT style 'ATLAS'
Out[39]:

Machine Learning with TMVA

Let's construct a simple classification problem with NumPy arrays and train a classifier with TMVA

In [10]:
import matplotlib.pyplot as plt
from numpy.random import RandomState; RNG = RandomState(42)
from root_numpy.tmva import add_classification_events, evaluate_reader
from ROOT import TMVA, TFile, TCut

# Construct an example dataset for binary classification
n_vars = 2
n_events = 1000
signal = RNG.multivariate_normal(
    np.ones(n_vars), np.diag(np.ones(n_vars)), n_events)
background = RNG.multivariate_normal(
    np.ones(n_vars) * -1, np.diag(np.ones(n_vars)), n_events)

X = np.concatenate([signal, background])
y = np.ones(X.shape[0])
w = RNG.randint(1, 10, n_events * 2)
y[signal.shape[0]:] *= -1
permute = RNG.permutation(y.shape[0])
X = X[permute]; y = y[permute]
In [11]:
# Split into training and test datasets
X_train, y_train, w_train = X[:n_events], y[:n_events], w[:n_events]
X_test, y_test, w_test = X[n_events:], y[n_events:], w[n_events:]

output = TFile('tmva_output.root', 'recreate')
factory = TMVA.Factory('classifier', output,
                       'AnalysisType=Classification:'
                       '!V:Silent:!DrawProgressBar')
for n in range(n_vars):
    factory.AddVariable('f{0}'.format(n), 'F')

# Call root_numpy's utility functions to add events from the arrays
add_classification_events(factory, X_train, y_train, weights=w_train)
add_classification_events(factory, X_test, y_test, weights=w_test, test=True)

# Train a classifier
factory.PrepareTrainingAndTestTree(TCut('1'), 'NormMode=EqualNumEvents')
factory.BookMethod('Fisher', 'Fisher',
                   'Fisher:VarTransform=None:CreateMVAPdfs:'
                   'PDFInterpolMVAPdf=Spline2:NbinsMVAPdf=50:'
                   'NsmoothMVAPdf=10')
factory.TrainAllMethods()
In [12]:
def plot(reader, twoclass_output):
    plot_colors = "br"
    plot_step = 0.02
    class_names = "AB"
    cmap = plt.get_cmap('bwr')

    plt.figure(figsize=(10.5, 5))

    # Plot the decision boundaries
    plt.subplot(121)
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    Z = evaluate_reader(reader, 'Fisher', np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=cmap, vmin=Z.min(), vmax=Z.max(),
                 levels=np.linspace(Z.min(), Z.max(), 50))
    plt.contour(xx, yy, Z, levels=[0], linestyles='dashed')
    plt.axis("tight")

    # Plot the training points
    for i, n, c in zip([-1, 1], class_names, plot_colors):
        idx = np.where(y == i)
        plt.scatter(X[idx, 0], X[idx, 1],
                    c=c, cmap=cmap,
                    label="Class %s" % n)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.legend(loc='upper right')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title('Decision Boundary')

    # Plot the two-class decision scores
    plot_range = (twoclass_output.min(), twoclass_output.max())
    plt.subplot(122)
    for i, n, c in zip([-1, 1], class_names, plot_colors):
        plt.hist(twoclass_output[y_test == i],
                 bins=10,
                 range=plot_range,
                 facecolor=c,
                 label='Class %s' % n,
                 alpha=.5)
    x1, x2, y1, y2 = plt.axis()
    plt.axis((x1, x2, y1, y2 * 1.2))
    plt.legend(loc='upper right')
    plt.ylabel('Samples')
    plt.xlabel('Score')
    plt.title('Decision Scores')

    plt.tight_layout()
    plt.subplots_adjust(wspace=0.35)
In [13]:
from array import array

# Classify the test dataset with the classifier
reader = TMVA.Reader()
for n in range(n_vars):
    reader.AddVariable('f{0}'.format(n), array('f', [0.]))
reader.BookMVA('Fisher', 'weights/classifier_Fisher.weights.xml')
twoclass_output = evaluate_reader(reader, 'Fisher', X_test)
print(twoclass_output[:5])
plot(reader, twoclass_output) # my magic function...
[ 1.38821707  2.28084694 -1.23773473  1.7120243   0.6922358 ]

Regression with TMVA

In [14]:
def plot_regression(X, y, y_1=None, y_2=None):
    # Plot the results
    plt.figure(figsize=(8, 6))
    plt.scatter(X, y, c="k", label="training samples")
    if y_1 is not None:
        plt.plot(X, y_1, c="g", label="1 tree", linewidth=2)
        plt.plot(X, y_2, c="r", label="300 trees", linewidth=2)
    plt.xlabel("data")
    plt.ylabel("target")
    if y_1 is not None:
        plt.title("Boosted Decision Tree Regression")
        plt.legend()
    plt.show()
In [15]:
from root_numpy.tmva import add_regression_events

# Create an example regression dataset
RNG = np.random.RandomState(1)
X = np.linspace(0, 6, 100)[:, np.newaxis]
y = np.sin(X).ravel() + \
    np.sin(6 * X).ravel() + \
    RNG.normal(0, 0.1, X.shape[0])
plot_regression(X, y)
In [16]:
# Fit a regression model
output = TFile('tmva_output.root', 'recreate')
factory = TMVA.Factory('regressor', output,
                       'AnalysisType=Regression:'
                       '!V:Silent:!DrawProgressBar')
factory.AddVariable('x', 'F')
factory.AddTarget('y', 'F')

add_regression_events(factory, X, y)
add_regression_events(factory, X, y, test=True)
factory.PrepareTrainingAndTestTree(TCut('1'), 'NormMode=EqualNumEvents')
factory.BookMethod('BDT', 'BDT1',
                   'nCuts=20:NTrees=1:MaxDepth=4:BoostType=AdaBoostR2:'
                   'SeparationType=RegressionVariance')
factory.BookMethod('BDT', 'BDT2',
                   'nCuts=20:NTrees=300:MaxDepth=4:BoostType=AdaBoostR2:'
                   'SeparationType=RegressionVariance')
factory.TrainAllMethods()
In [17]:
# Predict the regression target
reader = TMVA.Reader()
reader.AddVariable('x', array('f', [0.]))
reader.BookMVA('BDT1', 'weights/regressor_BDT1.weights.xml')
reader.BookMVA('BDT2', 'weights/regressor_BDT2.weights.xml')
y_1 = evaluate_reader(reader, 'BDT1', X)
y_2 = evaluate_reader(reader, 'BDT2', X)
plot_regression(X, y, y_1, y_2)

Machine Learning with scikit-learn

  • TMVA is the gold standard for machine learning in HEP
  • scikit-learn is a powerful alternative, but is Python-based and works with NumPy arrays: scikit-learn.org
  • Now we can use the same data in ROOT with scikit-learn
In [18]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score

from root_numpy.tmva import add_classification_events, evaluate_reader
In [19]:
def plot_sample(X_train, y_train):
    signal = y_train > 0
    background = y_train < 0

    plt.figure(figsize=(12, 5))
    plt.subplot(121)
    plt.scatter(X_train[signal, 0], X_train[signal, 1], c='red')
    plt.scatter(X_train[background, 0], X_train[background, 1], c='blue')
    plt.xlabel("Feature 0")
    plt.ylabel("Feature 1")

    plt.subplot(122)
    plt.scatter(X_train[signal, 4], X_train[signal, 5], c='red')
    plt.scatter(X_train[background, 4], X_train[background, 5], c='blue')
    plt.xlabel("Feature 4")
    plt.ylabel("Feature 5")
    plt.tight_layout()
    plt.subplots_adjust(wspace=0.35)
    plt.show()
In [20]:
# Generate an example dataset
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

# Train on the first 2000, test on the rest
X_train, y_train = X[:2000], y[:2000]
X_test, y_test = X[2000:], y[2000:]
plot_sample(X_train, y_train)

First with TMVA

In [21]:
outfile = TFile('/tmp/tmva_output.root', 'recreate')
factory = TMVA.Factory('TMVAClassification', outfile,
                       'AnalysisType=Classification:'
                       '!V:Silent:!DrawProgressBar')
for idx in range(10):
    factory.AddVariable('f{0}'.format(idx), 'F')
add_classification_events(factory, X_train, y_train)
add_classification_events(factory, X_test, y_test, test=True)
factory.PrepareTrainingAndTestTree(TCut('1'), 'NormMode=NumEvents')
factory.BookMethod(TMVA.Types.kBDT, 'BDT', 'nCuts=-1:NTrees=100')
factory.TrainAllMethods()
In [26]:
def show_clf_results(y_test, y_predicted, y_decision, name):
    print "Area under ROC curve: %.4f" % (roc_auc_score(y_test, y_predicted))

    score_range = y_decision.min(), y_decision.max()
    plt.hist(y_decision[y_test>0.],
             color='r', alpha=0.5, range=score_range, bins=30)
    plt.hist(y_decision[y_test<0.],
             color='b', alpha=0.5, range=score_range, bins=30)
    plt.xlabel("{0} BDT output".format(name))
    plt.show()
In [28]:
reader = TMVA.Reader()
for idx in range(10):
    reader.AddVariable('f{0}'.format(idx), array('f', [0.]))
    
reader.BookMVA('BDT', 'weights/TMVAClassification_BDT.weights.xml')
y_decision = evaluate_reader(reader, "BDT", X_test)
y_predicted = 2 * (y_decision > 0) - 1
show_clf_results(y_test, y_predicted, y_decision, 'TMVA')
Area under ROC curve: 0.8720

Now with scikit-learn

In [29]:
dt = DecisionTreeClassifier(max_depth=3,
                            min_samples_leaf=0.05*len(X_train))
bdt = AdaBoostClassifier(dt, algorithm='SAMME',
                         n_estimators=100, learning_rate=0.5)
bdt.fit(X_train, y_train)
y_predicted = bdt.predict(X_test)
y_decision = bdt.decision_function(X_test).ravel()
show_clf_results(y_test, y_predicted, y_decision, 'scikit-learn')
Area under ROC curve: 0.8741