Skip to content

Commit

Permalink
Initial implementation of bulk_load command (CO-1678)
Browse files Browse the repository at this point in the history
  • Loading branch information
Benn Oshrin committed Nov 17, 2019
1 parent 3eeac2e commit 683b763
Show file tree
Hide file tree
Showing 6 changed files with 463 additions and 173 deletions.
271 changes: 271 additions & 0 deletions app/src/Command/BulkLoadCommand.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
<?php
/**
* COmanage Match Bulk Load Command
*
* Portions licensed to the University Corporation for Advanced Internet
* Development, Inc. ("UCAID") under one or more contributor license agreements.
* See the NOTICE file distributed with this work for additional information
* regarding copyright ownership.
*
* UCAID licenses this file to you under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* @link http://www.internet2.edu/comanage COmanage Project
* @package match
* @since COmanage Match v1.0.0
* @license Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
*/

declare(strict_types = 1);

namespace App\Command;

use App\Application;
use Cake\Console\Arguments;
use Cake\Console\Command;
use Cake\Console\CommandRunner;
use Cake\Console\ConsoleIo;
use Cake\Console\ConsoleOptionParser;
use Cake\Datasource\Exception\RecordNotFoundException;
use Cake\ORM\TableRegistry;
use Cake\Utility\Hash;
use Cake\Utility\Security;

//use \App\Lib\Enum\PermissionEnum;
use \App\Lib\Enum\ConfidenceModeEnum;

class BulkLoadCommand extends Command {
  /**
   * Register command specific options.
   *
   * Adds the "matchgrid-id" option and the boolean "skip-match" flag. The
   * input file itself is read from the first positional argument by
   * execute().
   *
   * @since  COmanage Match v1.0.0
   * @param  ConsoleOptionParser $parser Console Option Parser
   * @return ConsoleOptionParser Console Option Parser
   */

  public function buildOptionParser(ConsoleOptionParser $parser) {
    $parser->addOption('matchgrid-id', [
      'help' => __('match.fd.matchgrid_id')
    ])
    ->addOption('skip-match', [
      'help' => __('match.cmd.opt.skip-match'),
      'boolean' => true
    ]);

    return $parser;
  }

  /**
   * Execute the Bulk Load Command.
   *
   * Reads a CSV file (first positional argument) whose first row is a header
   * naming each column. The columns "sor", "sorid" and "referenceid" are
   * handled specially; every other column must be the name of an attribute
   * configured on the Matchgrid. Each data row is then either attached
   * directly to a Reference ID (when skip-match is set, or the row carries its
   * own Reference ID) or run through the matching rules.
   *
   * Per-row failures are reported and counted but do not stop the load. The
   * input file is always closed, and (when skip-match dropped them) the
   * indexes are always rebuilt, even if the load aborts part way through.
   *
   * @since  COmanage Match v1.0.0
   * @param  Arguments $args Command Arguments
   * @param  ConsoleIo $io   Console IO
   */

  public function execute(Arguments $args, ConsoleIo $io) {
    // Verify that we have a valid matchgrid
    $Matchgrids = TableRegistry::get('Matchgrids');

    try {
      $mg = $Matchgrids->getMatchgridConfig($args->getOption('matchgrid-id'));
    }
    catch(RecordNotFoundException $e) {
      $io->out(__("match.er.notfound", [__("match.ct.matchgrids", [1]), $args->getOption('matchgrid-id')]));
      return;
    }

    // Pull the list of Systems of Record so we can validate the inbound records
    $SoR = TableRegistry::get('SystemsOfRecord');

    $sors = $SoR->find('list', ['keyField' => 'label', 'valueField' => 'id'])
                ->where(['matchgrid_id' => $mg->id])
                ->toArray();

    // Map the attribute name (as provided in the CSV header) to the API name
    // as configured in the Attribute object. We could do something like
    //  Hash::combine($mg->attributes, '{n}.name', '{n}.api_name');
    // but we need to append the attribute group name for those attributes that
    // have one, and there's not an elegant way to do that with Hash.
    // So we just walk the list ourselves.

    $attrApiNames = [];

    foreach($mg->attributes as $a) {
      $k = $a->name;
      $v = $a->api_name;

      if(!empty($a->attribute_group->name)) {
        $v .= "/" . $a->attribute_group->name;
      }

      $attrApiNames[$k] = $v;
    }

    // Open the input file. Under strict_types a missing argument (null) must
    // not reach fopen(), so check for it explicitly.
    $filename = $args->getArgumentAt(0);

    if($filename === null || $filename === '') {
      $io->out("No input file was specified"); // XXX I18n
      return;
    }

    $handle = fopen($filename, "r");

    if($handle === false) {
      $io->out("Could not open input file: " . $filename); // XXX I18n
      return;
    }

    // State the cleanup code in the finally block needs to know about
    $MatchService = null;
    $MatchgridBuilder = null;
    $connected = false;       // True once MatchService->connect() succeeded
    $indexesDropped = false;  // True once we asked for the indexes to be dropped
    $skipMatch = (bool)$args->getOption('skip-match');

    try {
      // Read the header. fgetcsv returns false at EOF/error and [null] for a
      // blank line, neither of which is a usable header.
      $attrIndex = fgetcsv($handle);

      if(!is_array($attrIndex) || $attrIndex === [null]) {
        throw new \RuntimeException("Input file does not contain a header row"); // XXX I18n
      }

      // And flip the array so we know the positions
      $attrs = array_flip($attrIndex);

      // Validate the header once, rather than failing on every single row
      if(!isset($attrs['sor'])) {
        throw new \InvalidArgumentException("Input file header does not include the required 'sor' column"); // XXX I18n
      }

      foreach($attrIndex as $name) {
        if(!in_array($name, ['referenceid', 'sor', 'sorid'], true)
           && !array_key_exists($name, $attrApiNames)) {
          throw new \InvalidArgumentException("Column '" . $name . "' does not map to a valid attribute name"); // XXX I18n
        }
      }

      $MatchService = new \App\Lib\Match\MatchService();

      $MatchService->connect();
      $connected = true;
      $MatchService->setConfig($mg->id);

      // First see if table is already built. If not, throw an error that the
      // admin should build it first. Strictly speaking, this is not required,
      // but it provides a sanity check that the admin has properly configured
      // the Matchgrid.

      if($MatchService->getRowCount() === false) {
        // Table does not exist, throw an error

        throw new \RuntimeException(__('match.er.table', [$mg->table_name]));
      }

      $io->out(__('match.cmd.bl.start', [$mg->table_name]));

      $MatchgridBuilder = new \App\Lib\Match\MatchgridBuilder();

      // Track some statistics
      $cnt = 0;             // Number of records
      $errcnt = 0;          // Number of records that failed
      $fuzzycnt = 0;        // Number of records that resulted in potential matches (only if NOT skip-match)
      $starttime = time();  // Time run started

      if($skipMatch) {
        // Drop the indexes. Set the flag first so that a partially completed
        // drop still triggers a rebuild during cleanup.
        $io->out(__('match.cmd.bl.index.off'));
        $indexesDropped = true;
        $MatchgridBuilder->build($mg, $mg->attributes, false);
      }

      while(($data = fgetcsv($handle)) !== false) {
        if($data === [null]) {
          // Blank line (eg: trailing newline at end of file), not a record
          continue;
        }

        $cnt++;

        $AttributeManager = null;
        $dataByApiName = [];
        $sor = "";
        $sorid = "";
        $referenceId = "new";

        try {
          // Verify that we have a valid SOR label. A short row may not reach
          // the sor column at all, in which case the label is treated as empty.
          $sorLabel = $data[ $attrs['sor'] ] ?? '';

          if(!isset($sors[$sorLabel])) {
            throw new \InvalidArgumentException("Unknown SOR Label: " . $sorLabel); // XXX I18n
          }

          // Map the data into an array keyed on the API Name, which is what
          // AttributeManager wants to work with
          foreach($data as $index => $value) {
            if(!isset($attrIndex[$index])) {
              // Row has more columns than the header
              throw new \InvalidArgumentException("Column " . $index . " does not map to a valid attribute name"); // XXX I18n
            }

            switch($attrIndex[$index]) {
              case 'referenceid':
                if($value) {
                  $referenceId = $value;
                  $dataByApiName['referenceid'] = $value;
                }
                break;
              case 'sor':
                $sor = $value;
                break;
              case 'sorid':
                $sorid = $value;
                break;
              default:
                // Header validation guarantees this key exists
                $dataByApiName[ $attrApiNames[ $attrIndex[$index] ] ] = $value;
                break;
            }
          }

          // Instantiate the AttributeManager
          $AttributeManager = new \App\Lib\Match\AttributeManager();

          $AttributeManager->parseFromArray($dataByApiName);

          // We skip matching if configured, or if the individual record already
          // has a Reference ID attached.
          if($skipMatch || $referenceId !== 'new') {
            // This will do the right thing, whether $referenceId is valid or "new"
            $MatchService->attachReferenceId($sor, $sorid, $AttributeManager, $referenceId);
          } else {
            $results = $MatchService->searchReferenceId($sor, $sorid, $AttributeManager);

            // Did any rules run successfully? If not (eg: no attributes provided in the
            // request, no rules defined) then throw an error.
            if(empty($results->getSuccessfulRules())) {
              throw new \RuntimeException(__('match.er.rules.unsuccessful'));
            }

            if($results->count() == 0) {
              // No match
              $MatchService->assignReferenceId($sor, $sorid, $AttributeManager);
            } elseif($results->getConfidenceMode() == ConfidenceModeEnum::Canonical) {
              // Exact match
              $refIds = $results->getReferenceIds();

              if(!empty($refIds[0])) {
                $MatchService->attachReferenceId($sor, $sorid, $AttributeManager, (string)$refIds[0]);
              }
            } else {
              // Fuzzy match, we insert the record but do NOT send notification
              $matchRequest = $MatchService->insertPending($sor, $sorid, $AttributeManager);

              $io->out("Match request unresolved for " . $sor . "/" . $sorid . " (assigned request ID " . $matchRequest . ")"); // XXX I18n
              $fuzzycnt++;
            }
          }
        }
        catch(\Exception $e) {
          // A bad record is reported and counted, but does not stop the load
          $io->out(__('match.er.bl.line', [$cnt, $e->getMessage()]));
          $errcnt++;
        }

        // Since we might be in a very large file, unset our objects in each
        // iteration to flag the memory for reclamation

        unset($AttributeManager);
        unset($dataByApiName);

        if($cnt % 100 === 0) {
          $io->out(__('match.cmd.bl.progress', [$cnt]));
        }
      }

      $elapsed = time() - $starttime;

      $io->out(__('match.cmd.bl.summary', [$cnt, $errcnt, $fuzzycnt, $elapsed]));
    }
    catch(\Exception $e) {
      $io->out($e->getMessage());
    }
    finally {
      // Cleanup runs whether the load completed or aborted. Each step is
      // guarded separately so that one failing does not prevent the next.

      try {
        if($indexesDropped) {
          // Recreate the indexes, even on load error
          $io->out(__('match.cmd.bl.index.on'));
          $MatchgridBuilder->build($mg, $mg->attributes, true);
        }
      }
      catch(\Exception $e) {
        $io->out($e->getMessage());
      }

      try {
        if($connected) {
          $MatchService->disconnect();
        }
      }
      catch(\Exception $e) {
        $io->out($e->getMessage());
      }

      fclose($handle);
    }
  }
}
33 changes: 33 additions & 0 deletions app/src/Lib/Match/AttributeManager.php
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,39 @@ public function getValueByContext(string $attribute, string $context="_default")
return null;
}

/**
* Load an array into the Attribute Manager.
*
* @since COmanage Match v1.0.0
* @param array $attributes Array where keys are API Names and values are attribute values
* @throws RuntimeException
*/

public function parseFromArray(array $attributes) {
  // Remember the caller's requested Reference ID, but only when a non-empty
  // value was actually supplied.
  $requested = $attributes['referenceid'] ?? null;

  if(!empty($requested)) {
    $this->requestedReferenceId = $requested;
  }

  // An API Name is one of
  //  (1) simple,  eg "dateOfBirth"                     => stored under "_default"
  //  (2) typed,   eg "identifiers:identifier/national" => stored under the type
  //  (3) grouped, eg "names:given/official"            => stored under the group name
  // Forms (2) and (3) are handled identically: whatever follows the first
  // slash is the context, whatever precedes it is the attribute key.
  // Note every entry of $attributes is stored, including "referenceid".

  foreach($attributes as $apiName => $value) {
    $parts = explode('/', $apiName, 2);
    $context = $parts[1] ?? '';

    if(empty($context)) {
      // No usable context, so keep the full API Name as the key
      $this->attributes['_default'][$apiName] = $value;
      continue;
    }

    $this->attributes[$context][ $parts[0] ] = $value;
  }
}

/**
* Load a JSON object (as returned from json_decode) into the Attribute Manager.
*
Expand Down
24 changes: 24 additions & 0 deletions app/src/Lib/Match/MatchService.php
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,30 @@ public function getRequestsForReferenceId(string $referenceId) {
return $results;
}

/**
* Determine the number of rows in the Matchgrid.
*
* @since COmanage Match v1.0.0
* @param string $sor SOR Label, if provided only count rows for this SOR
* @return int The number of rows counted
*/

public function getRowCount(string $sor="") {
  // Start with a count over the whole Matchgrid table
  $query = "SELECT COUNT(*)
FROM " . $this->mgTable;
  $bindings = [];

  // Narrow the count to a single SOR when a label was given. The label is
  // passed as a bound parameter rather than concatenated into the SQL.
  if($sor !== "") {
    $query .= " WHERE sor=?";
    $bindings = [$sor];
  }

  // Hand back whatever the database layer reports for the single value.
  // NOTE(review): BulkLoadCommand treats a strict false from this method as
  // "table does not exist" - confirm against the database layer's contract.
  return $this->dbc->GetOne($this->dbc->Prepare($query), $bindings);
}

/**
* Obtain the current attributes for an SOR record.
*
Expand Down
Loading

0 comments on commit 683b763

Please sign in to comment.