How to speed up evaluation of complex metrics


#1

After the responses in evalMetricsWithTimeseries?, I coded up a somewhat general approach to optimizing evalMetrics for complex metrics that involve, as one of their parts, aggregations across many sources (e.g., using rollupMetric). We have a hierarchy of Facility objects with a site at the top, buildings in the middle and housings at the bottom. The complex metrics are formulas for a housing that contain sums of other metrics over all the housings of the site as terms/coefficients. If you just run evalMetrics on all the housings of a site, all the aggregations are computed from scratch for each housing. Caching metrics does not quite cut it here (I am aware of new work on caching rollupMetric results). Feedback is welcome once you have read through the rest of this post.

The reason such an evaluation may take minutes when run on all the housings of a site is that the optimizer lacks information about what can be evaluated only once and then reused. In our case, we know that some metrics evaluate to the same result for all the housings, but how can we express that to the metric engine (in order to avoid hand-coding)? These are the main parts of the answer:

  1. bindings can be used to pass timeseries to evalMetrics without having to modify the metrics being evaluated: the timeseries bound to metric names take precedence over evaluation of those metrics
  2. metrics have field tags, which we use to tag compound metrics whose “scope” is the whole site, i.e., their result is the same when evaluated on any housing source/id
  3. some simple metrics in our case have either
    1. a path (or tsDecl.data) which startsWith a field that is global/site-scoped, or
    2. an actionDecl, which is always a rollupMetric, i.e., an aggregation across a site
  4. a wrapper around evalMetrics placed in type Memoization, which
    1. analyzes expressions in EvalMetricsSpec and extracts the evaluate-once-only-and-bind ones
    2. runs evalMetrics on those metrics and any one housing, and keeps the result in a local variable
    3. returns the result of evalMetrics on the original expressions, and with bindings being the previous result

This can give a few orders of magnitude speedup for large sites.
Below is the relevant skeleton of our implementation, which is still specific to our use case:

// Returns a map of all compound metrics inside one called `metricName`,
// with value including interesting information such as compound submetrics and
// expression expansions involving simple metrics only
function expandCompound(metricName, expansions) {
    var cm = CompoundMetric.get(metricName);
    expansions = expansions || {};
    if (cm && !(metricName in expansions)) {
        var idRe = /[a-zA-Z_][a-zA-Z0-9_]*/gm;
        var e = cm.evalExpr();
        var scm = [];
        var more;
        do {
            more = false;
            var m = idRe.exec(e);
            if (m) {
                more = true;
                // WARNING assuming no static cycles
                var r = expandCompound(m[0], expansions);
                expansions = r.expansions;
                if (r.expr) {
                    e = e.substring(0, m.index) + '(' + r.expr + ')' +
                        e.substring(m.index + m[0].length, e.length);
                    scm = _.contains(scm, m[0]) ? scm : scm.concat(m[0]);
                    idRe.lastIndex = m.index;
                }
            }
        } while (more);
        expansions[metricName] = {
            metric: cm,
            subCompoundMetrics: scm,
            simpleExpr: e
        };
        return { expr: e, expansions: expansions };
    }
    return { expansions: expansions };
}
// Returns the array of all compound metrics involved in `metricName` one
function compoundClosure(metricName, expansions) {
    var c = {};
    c[metricName] = true;
    var q = expansions[metricName].subCompoundMetrics.slice();
    while (q.length) {
        var f = q.pop();
        c[f] = true;
        q = q.concat(expansions[f].subCompoundMetrics.slice());
    }
    return _.keys(c);
}
// Special assumptions:
//    srcType is Facility
//    ems has as field ids the ids of housings of a site
//    expansions is a precomputed map for all compound metrics (can be done once)
// Returns the result of `Facility.evalMetrics(ems)` but faster by memoizing selectively
function evalMetrics(srcType, ems, expansions) {
    var compounds = expansions;
    var simples = {};
    _.each(ems.expressions, function(cmn) { // complete compounds if necessary
        if (!(cmn in compounds)) {
            var r = expandCompound(cmn, compounds); // accumulate for speed
            if (!r.expr)
                throw new Error('compound metric names only allowed as expressions');
            compounds = _.extend(compounds, r.expansions);
        }
        _.each(CompoundMetric.listSimpleMetrics(cmn), function(sm) {
            simples[sm.id] = sm;
        });
    });
    var smFilter = function(sm) { // returns true iff sm is site-scoped
        return (sm.path &&
                (sm.path.startsWith('site') ||
                 sm.path.startsWith('serviceAgreement'))) ||
            (sm.tsDecl && sm.tsDecl.data &&
             (sm.tsDecl.data.startsWith('site') ||
              sm.tsDecl.data.startsWith('serviceAgreement'))) ||
            !!sm.actionDecl; // assuming rollupMetric over site
    };
    var negSmFilter = function(sm) { return !smFilter(sm); };
    var tagPred = function(cm) { // returns true iff cm is site-scoped
        var tags = compounds[cm].metric.tags;
        return tags && tags.indexOf('site_scope') >= 0;
    };
    var negTagPred = function(cm) { return !tagPred(cm); };
    var cmFilter = function(mn) {
        return _.filter(compoundClosure(mn, compounds), tagPred);
    };
    var negCmFilter = function(mn) {
        return _.filter(compoundClosure(mn, compounds), negTagPred);
    };
    var siteExprs = _.union(
        _.flatten(_.map(ems.expressions, cmFilter), true).
            concat(_.pluck(_.filter(simples, smFilter), 'name')));
    var nonSiteExprs = _.union( // for double-checking
        _.flatten(_.map(ems.expressions, negCmFilter), true).
            concat(_.pluck(_.filter(simples, negSmFilter), 'name')));
    log.info(Loggs.strIP( // for debugging purposes
        '## Memoization.evalMetrics expressions: {e}, (non)siteExprs: {x}',
        { e: ems.expressions,
          x: { siteExprs: siteExprs, nonSiteExprs: nonSiteExprs }}));
    var ems2 = ems.clone(); // submetrics to evaluate at most once
    var id = ems2.ids[0];
    ems2.ids = [id]; // whatever id/source, the result is the same
    ems2.expressions = siteExprs;
    var memo = c3Type(srcType).evalMetrics(ems2);
    var ems3 = ems.clone(); // to evaluate using previous evals
    ems3.bindings = memo.result[id];
    return c3Type(srcType).evalMetrics(ems3);
}