-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial implementation of bulk_load command (CO-1678)
- Loading branch information
Benn Oshrin
committed
Nov 17, 2019
1 parent
3eeac2e
commit 683b763
Showing
6 changed files
with
463 additions
and
173 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,271 @@ | ||
| <?php | ||
| /** | ||
| * COmanage Match Bulk Load Command | ||
| * | ||
| * Portions licensed to the University Corporation for Advanced Internet | ||
| * Development, Inc. ("UCAID") under one or more contributor license agreements. | ||
| * See the NOTICE file distributed with this work for additional information | ||
| * regarding copyright ownership. | ||
| * | ||
| * UCAID licenses this file to you under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with the | ||
| * License. You may obtain a copy of the License at: | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| * | ||
| * @link http://www.internet2.edu/comanage COmanage Project | ||
| * @package match | ||
| * @since COmanage Match v1.0.0 | ||
| * @license Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0) | ||
| */ | ||
|
|
||
| declare(strict_types = 1); | ||
|
|
||
| namespace App\Command; | ||
|
|
||
| use App\Application; | ||
| use Cake\Console\Arguments; | ||
| use Cake\Console\Command; | ||
| use Cake\Console\CommandRunner; | ||
| use Cake\Console\ConsoleIo; | ||
| use Cake\Console\ConsoleOptionParser; | ||
| use Cake\Datasource\Exception\RecordNotFoundException; | ||
| use Cake\ORM\TableRegistry; | ||
| use Cake\Utility\Hash; | ||
| use Cake\Utility\Security; | ||
|
|
||
| //use \App\Lib\Enum\PermissionEnum; | ||
| use \App\Lib\Enum\ConfidenceModeEnum; | ||
|
|
||
class BulkLoadCommand extends Command {
  /**
   * Register command specific options.
   *
   * Adds the --matchgrid-id option and the boolean --skip-match flag. The
   * input CSV file is read from the first positional argument by execute().
   *
   * @since  COmanage Match v1.0.0
   * @param  ConsoleOptionParser $parser Console Option Parser
   * @return ConsoleOptionParser Console Option Parser
   */

  public function buildOptionParser(ConsoleOptionParser $parser) {
    $parser->addOption('matchgrid-id', [
      'help' => __('match.fd.matchgrid_id')
    ])
    ->addOption('skip-match', [
      'help' => __('match.cmd.opt.skip-match'),
      'boolean' => true
    ]);

    return $parser;
  }

  /**
   * Execute the Bulk Load Command.
   *
   * Reads a CSV file (first positional argument) whose header line names the
   * columns, and loads each subsequent row into the requested Matchgrid.
   * The columns "sor", "sorid" and "referenceid" are handled specially, every
   * other column must be the name of an Attribute configured on the Matchgrid.
   * Per-record failures are reported and counted, and do not stop the load.
   *
   * @since  COmanage Match v1.0.0
   * @param  Arguments $args Command Arguments
   * @param  ConsoleIo $io   Console IO
   */

  public function execute(Arguments $args, ConsoleIo $io) {
    // Verify that we have a valid matchgrid
    $Matchgrids = TableRegistry::get('Matchgrids');

    try {
      $mg = $Matchgrids->getMatchgridConfig($args->getOption('matchgrid-id'));
    }
    catch(RecordNotFoundException $e) {
      $io->out(__("match.er.notfound", [__("match.ct.matchgrids", [1]), $args->getOption('matchgrid-id')]));
      return;
    }

    // Pull the list of Systems of Record so we can validate the inbound records
    $SoR = TableRegistry::get('SystemsOfRecord');

    $sors = $SoR->find('list', ['keyField' => 'label', 'valueField' => 'id'])
                ->where(['matchgrid_id' => $mg->id])
                ->toArray();

    // Open the input file. We check for the argument ourselves since passing
    // null to fopen() is a TypeError under strict_types.
    $infile = $args->getArgumentAt(0);

    if(!is_string($infile) || $infile === '') {
      $io->out("No input file specified"); // XXX I18n
      return;
    }

    $handle = fopen($infile, "r");

    if($handle === false) {
      $io->out("Unable to open input file: " . $infile); // XXX I18n
      return;
    }

    try {
      // Read the header. fgetcsv() returns false for an empty file and [null]
      // for a blank line, neither of which is a usable header.
      $attrIndex = fgetcsv($handle);

      if(!is_array($attrIndex) || $attrIndex === [null]) {
        throw new \InvalidArgumentException("Input file does not contain a header line"); // XXX I18n
      }

      // We need to know where the SOR label is so we can validate each record
      $sorColumn = array_search('sor', $attrIndex, true);

      if($sorColumn === false) {
        throw new \InvalidArgumentException("Input file header does not contain the required column: sor"); // XXX I18n
      }

      $attrApiNames = $this->mapAttributeApiNames($mg->attributes);

      // Fail fast on columns we can't map, rather than failing every record
      // (and, for skip-match, needlessly dropping and rebuilding the indexes)
      $unknown = array_diff($attrIndex, ['referenceid', 'sor', 'sorid'], array_keys($attrApiNames));

      if(!empty($unknown)) {
        throw new \InvalidArgumentException("Unknown attribute name(s) in header: " . implode(", ", $unknown)); // XXX I18n
      }

      $MatchService = new \App\Lib\Match\MatchService();

      $MatchService->connect();
      $MatchService->setConfig($mg->id);

      // First see if table is already built. If not, throw an error that the
      // admin should build it first. Strictly speaking, this is not required,
      // but it provides a sanity check that the admin has properly configured
      // the Matchgrid.

      if($MatchService->getRowCount() === false) {
        // Table does not exist, throw an error

        throw new \RuntimeException(__('match.er.table', [$mg->table_name]));
      }

      $io->out(__('match.cmd.bl.start', [$mg->table_name]));

      $MatchgridBuilder = new \App\Lib\Match\MatchgridBuilder();
      $skipMatch = (bool)$args->getOption('skip-match');

      // Track some statistics
      $cnt = 0;             // Number of records
      $errcnt = 0;          // Number of records that failed
      $fuzzycnt = 0;        // Number of records that resulted in potential matches (only if NOT skip-match)
      $starttime = time();  // Time run started

      if($skipMatch) {
        // Drop the indexes
        $io->out(__('match.cmd.bl.index.off'));
        $MatchgridBuilder->build($mg, $mg->attributes, false);
      }

      try {
        while(($data = fgetcsv($handle)) !== false) {
          $cnt++;

          try {
            // Verify that we have a valid SOR label. Short (or blank) lines
            // may not have the SOR column at all.
            $sorLabel = $data[$sorColumn] ?? '';

            if(!isset($sors[$sorLabel])) {
              throw new \InvalidArgumentException("Unknown SOR Label: " . $sorLabel); // XXX I18n
            }

            $record = $this->parseRecord($data, $attrIndex, $attrApiNames);

            if($this->loadRecord($MatchService, $record, $skipMatch, $io)) {
              $fuzzycnt++;
            }
          }
          catch(\Exception $e) {
            $io->out(__('match.er.bl.line', [$cnt, $e->getMessage()]));
            $errcnt++;
          }

          // Since we might be in a very large file, drop the per-record data
          // in each iteration to flag the memory for reclamation

          unset($record);

          if($cnt % 100 == 0) {
            $io->out(__('match.cmd.bl.progress', [$cnt]));
          }
        }

        $elapsed = time() - $starttime;

        $io->out(__('match.cmd.bl.summary', [$cnt, $errcnt, $fuzzycnt, $elapsed]));
      }
      finally {
        if($skipMatch) {
          // Recreate the indexes, even on load error. This is in a finally
          // block so it also runs for failures the per-record handler does
          // not catch (eg: a TypeError), which would otherwise leave the
          // Matchgrid without indexes.
          $io->out(__('match.cmd.bl.index.on'));
          $MatchgridBuilder->build($mg, $mg->attributes, true);
        }
      }

      $MatchService->disconnect();
    }
    catch(\Exception $e) {
      $io->out($e->getMessage());
      return;
    }
    finally {
      // Always release the input file, however we leave
      fclose($handle);
    }
  }

  /**
   * Map Attribute names (as used in the CSV header) to API names.
   *
   * We could do something like
   *  Hash::combine($attributes, '{n}.name', '{n}.api_name');
   * but we need to append the attribute group name for those attributes that
   * have one, and there's not an elegant way to do that with Hash.
   * So we just walk the list ourselves.
   *
   * @since  COmanage Match v1.0.0
   * @param  iterable $attributes Attributes configured for the Matchgrid
   * @return array                API names ("api_name" or "api_name/group"), keyed on Attribute name
   */

  protected function mapAttributeApiNames($attributes) {
    $attrApiNames = [];

    foreach($attributes as $a) {
      $v = $a->api_name;

      if(!empty($a->attribute_group->name)) {
        $v .= "/" . $a->attribute_group->name;
      }

      $attrApiNames[$a->name] = $v;
    }

    return $attrApiNames;
  }

  /**
   * Split one CSV row into its SOR label, SOR ID, Reference ID and an array
   * of attribute values keyed on API name, which is what AttributeManager
   * wants to work with.
   *
   * @since  COmanage Match v1.0.0
   * @param  array $data         Row values, as returned by fgetcsv()
   * @param  array $attrIndex    Header values, indexed by column position
   * @param  array $attrApiNames API names keyed on Attribute name; must cover every non-reserved header value
   * @return array               Array with keys 'sor', 'sorid', 'referenceId' ("new" if none was provided) and 'attributes'
   * @throws \InvalidArgumentException If the row has more columns than the header
   */

  protected function parseRecord(array $data, array $attrIndex, array $attrApiNames) {
    $record = [
      'sor'         => "",
      'sorid'       => "",
      'referenceId' => "new",
      'attributes'  => []
    ];

    foreach($data as $index => $value) {
      if(!isset($attrIndex[$index])) {
        throw new \InvalidArgumentException("Column " . $index . " does not map to a valid attribute name"); // XXX I18n
      }

      switch($attrIndex[$index]) {
        case 'referenceid':
          // Compare against the empty values explicitly, since a plain
          // truthiness test would discard a Reference ID of "0"
          if($value !== null && $value !== '') {
            $record['referenceId'] = $value;
            $record['attributes']['referenceid'] = $value;
          }
          break;
        case 'sor':
          $record['sor'] = $value;
          break;
        case 'sorid':
          $record['sorid'] = $value;
          break;
        default:
          $record['attributes'][ $attrApiNames[ $attrIndex[$index] ] ] = $value;
          break;
      }
    }

    return $record;
  }

  /**
   * Load a single parsed record into the Matchgrid.
   *
   * @since  COmanage Match v1.0.0
   * @param  \App\Lib\Match\MatchService $MatchService Connected and configured Match Service
   * @param  array                       $record       Record, as returned by parseRecord()
   * @param  bool                        $skipMatch    If true, do not attempt to match the record
   * @param  ConsoleIo                   $io           Console IO
   * @return bool                                      True if the record was left pending (fuzzy match), false otherwise
   * @throws \RuntimeException If no match rules ran successfully
   */

  protected function loadRecord($MatchService, array $record, $skipMatch, ConsoleIo $io) {
    $sor = $record['sor'];
    $sorid = $record['sorid'];
    $referenceId = $record['referenceId'];

    // Instantiate the AttributeManager
    $AttributeManager = new \App\Lib\Match\AttributeManager();

    $AttributeManager->parseFromArray($record['attributes']);

    // We skip matching if configured, or if the individual record already
    // has a Reference ID attached.
    if($skipMatch || $referenceId !== 'new') {
      // This will do the right thing, whether $referenceId is valid or "new"
      $MatchService->attachReferenceId($sor, $sorid, $AttributeManager, $referenceId);

      return false;
    }

    $results = $MatchService->searchReferenceId($sor, $sorid, $AttributeManager);

    // Did any rules run successfully? If not (eg: no attributes provided in the
    // request, no rules defined) then throw an error.
    if(empty($results->getSuccessfulRules())) {
      throw new \RuntimeException(__('match.er.rules.unsuccessful'));
    }

    if($results->count() == 0) {
      // No match
      $MatchService->assignReferenceId($sor, $sorid, $AttributeManager);

      return false;
    }

    if($results->getConfidenceMode() == ConfidenceModeEnum::Canonical) {
      // Exact match
      $refIds = $results->getReferenceIds();

      // NOTE(review): if no Reference ID comes back the record is silently
      // not loaded and not counted as an error. Confirm this is intended.
      if(!empty($refIds[0])) {
        $MatchService->attachReferenceId($sor, $sorid, $AttributeManager, (string)$refIds[0]);
      }

      return false;
    }

    // Fuzzy match, we insert the record but do NOT send notification
    $matchRequest = $MatchService->insertPending($sor, $sorid, $AttributeManager);

    $io->out("Match request unresolved for " . $sor . "/" . $sorid . " (assigned request ID " . $matchRequest . ")"); // XXX I18n

    return true;
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.