diff --git a/app/src/Command/BulkLoadCommand.php b/app/src/Command/BulkLoadCommand.php new file mode 100644 index 000000000..7fa2912ff --- /dev/null +++ b/app/src/Command/BulkLoadCommand.php @@ -0,0 +1,271 @@ +addOption('matchgrid-id', [ + 'help' => __('match.fd.matchgrid_id') + ]) + ->addOption('skip-match', [ + 'help' => __('match.cmd.opt.skip-match'), + 'boolean' => true + ]); + + return $parser; + } + + /** + * Execute the Bulk Load Command. + * + * @since COmanage Match v1.0.0 + * @param Arguments $args Command Arguments + * @param ConsoleIo $io Console IO + */ + + public function execute(Arguments $args, ConsoleIo $io) { + // Verify that we have a valid matchgrid + $Matchgrids = TableRegistry::get('Matchgrids'); + + try { + $mg = $Matchgrids->getMatchgridConfig($args->getOption('matchgrid-id')); + } + catch(RecordNotFoundException $e) { + $io->out(__("match.er.notfound", [__("match.ct.matchgrids", [1]), $args->getOption('matchgrid-id')])); + return; + } + + // Pull the list of Systems of Record so we can validate the inbound records + $SoR = TableRegistry::get('SystemsOfRecord'); + + $sors = $SoR->find('list', ['keyField' => 'label', 'valueField' => 'id']) + ->where(['matchgrid_id' => $mg->id]) + ->toArray(); + + // Open the input file and read the field header + $handle = fopen($args->getArgumentAt(0), "r"); + + if(!$handle) + return; + + // Read the header + $attrIndex = fgetcsv($handle); + + // And flip the array so we know the positions + $attrs = array_flip($attrIndex); + + // Map the attribute name (as provided in the CSV header) to the API name + // as configured in the Attribute object. We could do something like + // Hash::combine($mg->attributes, '{n}.name', '{n}.api_name'); + // but we need to append the attribute group name for those attributes that + // have one, and there's not an elegant way to do that with Hash. + // So we just walk the list ourselves. 
+ + $attrApiNames = []; + + foreach($mg->attributes as $a) { + $k = $a->name; + $v = $a->api_name; + + if(!empty($a->attribute_group->name)) { + $v .= "/" . $a->attribute_group->name; + } + + $attrApiNames[$k] = $v; + } + + try { + $MatchService = new \App\Lib\Match\MatchService(); + + $MatchService->connect(); + $MatchService->setConfig($mg->id); + + // First see if table is already built. If not, throw an error that the + // admin should build it first. Strictly speaking, this is not required, + // but it provides a sanity check that the admin has properly configured + // the Matchgrid. + + if($MatchService->getRowCount() === false) { + // Table does not exist, throw an error + + throw new \RuntimeException(__('match.er.table', [$mg->table_name])); + } + + $io->out(__('match.cmd.bl.start', [$mg->table_name])); + + $MatchgridBuilder = new \App\Lib\Match\MatchgridBuilder(); + + // Track some statistics + $cnt = 0; // Number of records + $errcnt = 0; // Number of records that failed + $fuzzycnt = 0; // Number of records that resulted in potential matches (only if NOT skip-match) + $starttime = time(); // Time run started + + if($args->getOption('skip-match')) { + // Drop the indexes + $io->out(__('match.cmd.bl.index.off')); + $MatchgridBuilder->build($mg, $mg->attributes, false); + } + + while(($data = fgetcsv($handle)) != false) { + $cnt++; + + $AttributeManager = null; + $dataByApiName = []; + $sor = ""; + $sorid = ""; + $referenceId = "new"; + + try { + // Verify that we have a valid SOR label + if(!isset($sors[ $data[ $attrs['sor'] ]])) { + throw new \InvalidArgumentException("Unknown SOR Label: " . $data[ $attrs['sor'] ]); // XXX I18n + } + + // Map the data into an array keyed on the API Name, which is what + // AttributeManager wants to work with + foreach($data as $index => $value) { + if(!isset($attrIndex[$index])) { + throw new \InvalidArgumentException("Column " . $index . 
" does not map to a valid attribute name"); // XXX I18n + } + + switch($attrIndex[$index]) { + case 'referenceid': + if($value) { + $referenceId = $value; + $dataByApiName['referenceid'] = $value; + } + break; + case 'sor': + $sor = $value; + break; + case 'sorid': + $sorid = $value; + break; + default: + $dataByApiName[ $attrApiNames[ $attrIndex[$index] ] ] = $value; + break; + } + } + + // Instantiate the AttributeManager + $AttributeManager = new \App\Lib\Match\AttributeManager(); + + $AttributeManager->parseFromArray($dataByApiName); + + // We skip matching if configured, or if the individual record already + // has a Reference ID attached. + if($args->getOption('skip-match') || $referenceId != 'new') { + // This will do the right thing, whether $referenceId is valid or "new" + $MatchService->attachReferenceId($sor, $sorid, $AttributeManager, $referenceId); + } else { + $results = $MatchService->searchReferenceId($sor, $sorid, $AttributeManager); + + // Did any rules run successfully? If not (eg: no attributes provided in the + // request, no rules defined) then throw an error. + if(empty($results->getSuccessfulRules())) { + throw new \RuntimeException(__('match.er.rules.unsuccessful')); + } + + if($results->count() == 0) { + // No match + $MatchService->assignReferenceId($sor, $sorid, $AttributeManager); + } elseif($results->getConfidenceMode() == ConfidenceModeEnum::Canonical) { + // Exact match + $refIds = $results->getReferenceIds(); + + if(!empty($refIds[0])) { + $MatchService->attachReferenceId($sor, $sorid, $AttributeManager, (string)$refIds[0]); + } + } else { + // Fuzzy match, we insert the record but do NOT send notification + $matchRequest = $MatchService->insertPending($sor, $sorid, $AttributeManager); + + $io->out("Match request unresolved for " . $sor . "/" . $sorid . " (assigned request ID " . $matchRequest . 
")"); // XXX I18n + $fuzzycnt++; + } + } + } + catch(\Exception $e) { + $io->out(__('match.er.bl.line', [$cnt, $e->getMessage()])); + $errcnt++; + } + + // Since we might be in a very large file, unset our objects in each + // iteration to flag the memory for reclamation + + unset($AttributeManager); + unset($dataByApiName); + + if($cnt % 100 == 0) { + $io->out(__('match.cmd.bl.progress', [$cnt])); + } + } + + $elapsed = time() - $starttime; + + $io->out(__('match.cmd.bl.summary', [$cnt, $errcnt, $fuzzycnt, $elapsed])); + + if($args->getOption('skip-match')) { + // Recreate the indexes, even on load error + $io->out(__('match.cmd.bl.index.on')); + $MatchgridBuilder->build($mg, $mg->attributes, true); + } + + $MatchService->disconnect(); + } + catch(\Exception $e) { + $io->out($e->getMessage()); + return; + } + } +} \ No newline at end of file diff --git a/app/src/Lib/Match/AttributeManager.php b/app/src/Lib/Match/AttributeManager.php index c0fc478aa..71923059e 100644 --- a/app/src/Lib/Match/AttributeManager.php +++ b/app/src/Lib/Match/AttributeManager.php @@ -138,6 +138,39 @@ public function getValueByContext(string $attribute, string $context="_default") return null; } + /** + * Load an array into the Attribute Manager. + * + * @since COmanage Match v1.0.0 + * @param array $attributes Array where keys are API Names and values are attribute values + * @throws RuntimeException + */ + + public function parseFromArray(array $attributes) { + // First grab the requested Reference ID, if specified. + + if(!empty($attributes['referenceid'])) { + $this->requestedReferenceId = $attributes['referenceid']; + } + + // API Names might be of the form + // (1) simple, eg "dateOfBirth", context = "_default" (or NULL) + // (2) typed, eg "identifiers:identifier/national", context = type (eg: "national") + // (3) grouped, eg "names:given/official", context = group name + // We treat the second and third forms the same. 
+ + foreach($attributes as $a => $v) { + $bits = explode('/', $a, 2); + + if(!empty($bits[1])) { + $this->attributes[ $bits[1] ][ $bits[0] ] = $v; + } else { + // Simple attribute + $this->attributes['_default'][$a] = $v; + } + } + } + /** * Load a JSON object (as returned from json_decode) into the Attribute Manager. * diff --git a/app/src/Lib/Match/MatchService.php b/app/src/Lib/Match/MatchService.php index 8a5ac52ce..9a0e61f86 100644 --- a/app/src/Lib/Match/MatchService.php +++ b/app/src/Lib/Match/MatchService.php @@ -255,6 +255,30 @@ public function getRequestsForReferenceId(string $referenceId) { return $results; } + /** + * Determine the number of rows in the Matchgrid. + * + * @since COmanage Match v1.0.0 + * @param string $sor SOR Label, if provided only count rows for this SOR + * @return int|false The number of rows counted, or false if the count could not be obtained (eg: the table does not exist) + */ + + public function getRowCount(string $sor="") { + $params = []; + + $sql = "SELECT COUNT(*) + FROM " . $this->mgTable; + + if($sor != "") { + $sql .= " WHERE sor=?"; + $params[] = $sor; + } + + $stmt = $this->dbc->Prepare($sql); + + return $this->dbc->GetOne($stmt, $params); + } + /** * Obtain the current attributes for an SOR record. * diff --git a/app/src/Lib/Match/MatchgridBuilder.php b/app/src/Lib/Match/MatchgridBuilder.php index 8822c286c..1ed60367d 100644 --- a/app/src/Lib/Match/MatchgridBuilder.php +++ b/app/src/Lib/Match/MatchgridBuilder.php @@ -33,179 +33,149 @@ use Cake\Datasource\ConnectionManager; use Cake\Utility\Xml; -require(ROOT . DS . "vendor" . DS . "adodb" . DS . "adodb-php" . DS . "adodb-xmlschema03.inc.php"); +use Doctrine\DBAL\DriverManager; +use Doctrine\DBAL\Schema\Comparator; +use Doctrine\DBAL\Schema\Schema; +use Doctrine\DBAL\Schema\SchemaDiff; class MatchgridBuilder { /** * Build the requested Matchgrid. 
* * @since COmanage Match v1.0.0 - * @param string $tablename Name of Matchgrid table + * @param \Cake\Datasource\EntityInterface $Matchgrid Matchgrid Object * @param array $attributes Array of Attributes + * @param bool $indexes Whether to build indexes (disable for bulk loading only) */ - public function build(string $tablename, array $attributes) { + public function build(\Cake\Datasource\EntityInterface $Matchgrid, array $attributes, bool $indexes=true) { // Connect to the database $dbc = $this->connect(); - // Convert the configuration given into an ADOdb AXMLS document - $xml = $this->configToSchema($dbc, $tablename, $attributes); - - // Execute the XML schema - $this->runSchema($dbc, $xml); + // Build and execute the schema + $this->configToSchema($dbc, $Matchgrid, $attributes, $indexes); // Disconnect - $dbc->Disconnect(); +// No DBAL disconnect? +// $dbc->disconnect(); } /** - * Convert a Matchgrid Attribute configuration into an ADODB schema. + * Convert a Matchgrid Attribute configuration into a DBAL schema. * * @since COmanage Match v1.0.0 - * @param ADOConnection $dbc ADOdb Connection Object - * @param string $tablename Name of Matchgrid table - * @param array $attributes Array of Attributes - * @return string XML Document holding schema + * @param DBALConnection $dbc DBAL Connection Object + * @param \Cake\Datasource\EntityInterface $Matchgrid Matchgrid Object + * @param array $attributes Array of Attributes + * @param bool $indexes Whether to build indexes (disable for bulk loading only) + * @throws Exceptions */ - protected function configToSchema($dbc, string $tablename, array $attributes) { - // We use Cake's XML library because it's simpler to work with. - // This requires constructing an array. + protected function configToSchema($dbc, \Cake\Datasource\EntityInterface $Matchgrid, array $attributes, bool $indexes=true) { + // Unlike ADOdb, there is no native DBAL format. 
We could create a JSON + // document similar to what DatabaseCommand uses, but the use case is just + // different enough that it's not really worth the effort at the moment. + + // Just let any errors bubble up the stack. + + $schema = new Schema(); + + // Create the table + $table = $schema->createTable("mg_" . $Matchgrid->table_name); + + // For type definitions see https://www.doctrine-project.org/api/dbal/2.9/Doctrine/DBAL/Types/Type.html // There are various mandatory columns that we hardcode here. - $fields = [ - // Primary Key - [ - '@name' => 'id', - '@type' => 'I', - 'key' => [], - 'autoincrement' => [] - ], - // XXX maybe SOR Label and ID should be UI configured so @size can be set? - // SOR Label - [ - '@name' => 'sor', - '@type' => 'C', - '@size' => '64' - ], - // See also ResultManager::getResultsForJson special handling - // SOR ID - [ - '@name' => 'sorid', - '@type' => 'C', - '@size' => '64' - ], - // Reference ID - [ - '@name' => 'referenceid', - '@type' => 'C', - '@size' => '64' - ], - // Request Time - [ - '@name' => 'request_time', - '@type' => 'T' - ], - // Resolution Time - [ - '@name' => 'resolution_time', - '@type' => 'T' - ] - ]; + $table->addColumn("id", "integer", ['autoincrement' => true, 'notnull' => true]); + // Maybe SOR Label and ID should be UI configured so length can be set? 
+ $table->addColumn("sor", "string", ['length' => 64, 'notnull' => true]); + // See also ResultManager::getResultsForJson special handling + $table->addColumn("sorid", "string", ['length' => 64, 'notnull' => true]); + $table->addColumn("referenceid", "string", ['length' => 64, 'notnull' => false]); + $table->addColumn("request_time", "datetime", ['notnull' => true]); + $table->addColumn("resolution_time", "datetime", ['notnull' => false]); // Add in the configured fields foreach($attributes as $attr) { - $fields[] = [ - '@name' => $attr->name, - // XXX everything is a varchar because we don't have a configuration option for field type - '@type' => 'C', - '@size' => '80' - ]; + // XXX everything is a varchar because we don't have a configuration option for field type + $table->addColumn($attr->name, "string", ['length' => 80, 'notnull' => false]); } - // Configure indexes. id should be auto-generated since it is a primary key. - $i = 1; - $indexes = [ - [ - '@name' => 'matchgrid_i'.$i++, - 'col' => 'sor' - ], - [ - '@name' => 'matchgrid_i'.$i++, - 'col' => 'sorid' - ], - [ - '@name' => 'matchgrid_i'.$i++, - 'col' => ['sor','sorid'], - 'unique' => [] - ], - [ - '@name' => 'matchgrid_i'.$i++, - 'col' => 'referenceid' - ], - /* The XML Schema can't handle the specification of NULLS FIRST, so we - need to create that index manually. - [ - '@name' => 'matchgrid_i'.$i++, - 'col' => 'resolution_time', -// Need custom SQL for this (flag @POSTGRESSPECIFIC ?) -// 'nulls first' => [] - ]*/ - ]; + $table->setPrimaryKey(["id"]); + + if($indexes) { + // Since index names need to be unique across the schema, we'll use the + // matchgrid ID to make the names unique. + + // $flags and $options as passed to Index(), but otherwise undocumented + $flags = []; + $options = []; + + $i = 1; + + // Start with the standard indexes + $indexLabel = "matchgrid_" . $Matchgrid->id . 
"_i"; + $table->addIndex(['sor'], $indexLabel.$i++, $flags, $options); + $table->addIndex(['sorid'], $indexLabel.$i++, $flags, $options); + $table->addUniqueIndex(['sor', 'sorid'], $indexLabel.$i++, $options); + $table->addIndex(['referenceid'], $indexLabel.$i++, $flags, $options); + $table->addIndex(['resolution_time'], $indexLabel.$i++, $flags, $options); + + // Add in indexes for configured fields + foreach($attributes as $attr) { + // We use the Entity ID to provide some level of reproducibility + $table->addIndex([$attr->name], 'matchgrid_' . $Matchgrid->id . '_attr_id'.$attr->id, $flags, $options); + } + } - $dict = NewDataDictionary($dbc); - // XXX we could skip the $i++ for index names and just use the attribute names matchgrid_attr_sor - // (since these shouldn't get renamed by admins) - // createIndexSql also generates a DROP INDEX if we pass REPLACE, however (contrary to - // the documentation at http://adodb.org/dokuwiki/doku.php?id=v5:dictionary:createindexsql) - // we can't add NULLS FIRST this way. - $sql = $dict->createIndexSql('matchgrid_i'.$i++, $tablename, 'resolution_time', ['REPLACE']); + // We're done with the table assembly, so move on to running the schema. - $sql[1] = rtrim($sql[1], ")") . 
" NULLS FIRST)"; + // This is the SQL that represents the desired state of the database + $toSql = $schema->toSql($dbc->getDatabasePlatform()); - // Add in indexes for configured fields - foreach($attributes as $attr) { - $indexes[] = [ - // We use the Entity ID to provide some level of reproducibility - '@name' => 'matchgrid_attr_id'.$attr->id, - 'col' => $attr->name - ]; - } + // SchemaManager provides info about the database + $sm = $dbc->getSchemaManager(); - // Assemble the schema (ADOdb AXMLS format) - $schema = [ - 'schema' => [ - '@version' => '0.3', - 'table' => [ - '@name' => $tablename, - 'field' => $fields, - 'index' => $indexes - ], - 'sql' => [ - 'query' => $sql - ] - ] - ]; + // The is the current database representation + $curSchema = $sm->createSchema(); - // Convert the schema to XML - $xobj = \Cake\Utility\Xml::fromArray($schema, array('format' => 'tags')); + $fromSql = $curSchema->toSql($dbc->getDatabasePlatform()); - return $xobj->asXML(); + $comparator = new Comparator(); + $schemaDiff = $comparator->compare($curSchema, $schema); + + $diffSql = $schemaDiff->toSaveSql($dbc->getDatabasePlatform()); + + // We don't start a transaction since in general we always want to move to + // the desired state, and if we fail in flight it's probably a bug that + // needs to be fixed. + + foreach($diffSql as $sql) { + if(preg_match("/^CREATE INDEX .* \(resolution_time\)$/", $sql)) { + // We need the resolution_time index to sort NULLS FIRST, and this is + // currently the least painful way to do it + $sql = rtrim($sql, ")") . " NULLS FIRST)"; + } + + $stmt = $dbc->query($sql); + // $stmt just returns the query string so we don't bother examining it + } } /** * Connect to the Database. * * @since COmanage Match v1.0.0 - * @throws RuntimeException + * @return \Doctrine\DBAL\Connection + * @throws DBALException */ protected function connect() { - // There's some overlap between here and DatabaseShell. + // There's some overlap between here and DatabaseCommand. 
// Use the ConnectionManager to get the database config to pass to adodb. $db = ConnectionManager::get('default'); - + // $db is a ConnectionInterface object $cfg = $db->config(); @@ -214,51 +184,16 @@ protected function connect() { throw new \RuntimeException(__('match.er.db.driver' , [ $cfg['driver'] ])); } - // // This really imples postgres8+ - $dbc = ADONewConnection('postgres9'); - - if(!$dbc->Connect($cfg['host'], - $cfg['username'], - $cfg['password'], - $cfg['database'])) { - throw new \RuntimeException(__('match.er.db.connect', [$dbc->ErrorMsg()])); - } - - return $dbc; - } - - /** - * Run the specified ADOdb Schema. - * - * @since COmanage Match v1.0.0 - * @param ADOConnection $dbc ADOdb Connection Object - * @param string $xml XML document, returned by configToSchema - */ - - protected function runSchema($dbc, string $xml) { - $schema = new \adoSchema($dbc); - - // ParseSchema is generating bad SQL for Postgres. eg: - // ALTER TABLE cm_cos ALTER COLUMN id SERIAL - // which (1) should be ALTER TABLE cm_cos ALTER COLUMN id TYPE SERIAL - // and (2) SERIAL isn't usable in an ALTER TABLE statement - // So we continue on error - // See also CO-1570, etc - $schema->ContinueOnError(true); - - // Parse the XML schema we were passed - $sql = $schema->ParseSchemaString($xml); + $config = new \Doctrine\DBAL\Configuration(); - switch($schema->ExecuteSchema($sql)) { - case 2: // !!! -// $this->out(__('Database schema update successful')); - break; - default: -// $this->out(__('Possibly failed to update database schema')); - break; - } + $cfargs = [ + 'dbname' => $cfg['database'], + 'user' => $cfg['username'], + 'password' => $cfg['password'], + 'host' => $cfg['host'], + 'driver' => ($cfg['driver'] == 'Cake\Database\Driver\Postgres' ? "pdo_pgsql" : "pdo_mysql") + ]; - // XXX After CO-1570 is addressed we should return true/false (or throw an - // exception on error) so an error message can percolate back up the stack. 
+ return DriverManager::getConnection($cfargs, $config); } } diff --git a/app/src/Locale/en_US/default.po b/app/src/Locale/en_US/default.po index 0171012c6..203c3d6c0 100644 --- a/app/src/Locale/en_US/default.po +++ b/app/src/Locale/en_US/default.po @@ -53,6 +53,21 @@ msgid "match.banner.api_users.platform" msgstr "This page is for configuring Platform API Users, which have full read/write access to the entire platform. To create API Users restricted to a given Matchgrid, go to the management page for the desired Matchgrid and select API Users from there.
The Match API is available at {0}" ### Command Line text +msgid "match.cmd.bl.index.off" +msgstr "Dropping matchgrid indexes..." + +msgid "match.cmd.bl.index.on" +msgstr "Rebuilding matchgrid indexes..." + +msgid "match.cmd.bl.progress" +msgstr "{0} records processed..." + +msgid "match.cmd.bl.start" +msgstr "Loading records to matchgrid table mg_{0}" + +msgid "match.cmd.bl.summary" +msgstr "Finished processing {0} records ({1} errors, {2} unresolved matches, total elapsed time {3} seconds)" + msgid "match.cmd.db.ok" msgstr "Database schema update successful" @@ -68,6 +83,9 @@ msgstr "Force a rerun of setup (only if you know what you are doing)"" msgid "match.cmd.opt.not" msgstr "Calculate changes but do not apply" +msgid "match.cmd.opt.skip-match" +msgstr "Do not run Match Rules while processing records" + msgid "match.cmd.se.admin" msgstr "- Creating initial administrator permission" @@ -173,6 +191,9 @@ msgstr "Incorrect arguments provided to {0}" msgid "match.er.attr.req" msgstr "Required attribute {0} not found in request" +msgid "match.er.bl.line" +msgstr "Error at line {0}: {1}" + msgid "match.er.build" msgstr "Error applying matchgrid schema: {0}" @@ -245,6 +266,9 @@ msgstr "{0} does not have any valid permissions" msgid "match.er.search_type" msgstr "Unknown search type '{0}'" +msgid "match.er.table" +msgstr "Matchgrid table {0} does not exist" + msgid "match.er.val.length" msgstr "Provided value exceeds maximum length of {0}" @@ -293,6 +317,9 @@ msgstr "Invalidates" msgid "match.fd.label" msgstr "Label" +msgid "match.fd.matchgrid_id" +msgstr "Matchgrid ID" + msgid "match.fd.name" msgstr "Name" diff --git a/app/src/Model/Table/MatchgridsTable.php b/app/src/Model/Table/MatchgridsTable.php index 3cec24b1f..26f996802 100644 --- a/app/src/Model/Table/MatchgridsTable.php +++ b/app/src/Model/Table/MatchgridsTable.php @@ -106,7 +106,7 @@ public function build(int $id) { $Builder = new MatchgridBuilder(); - $Builder->build("mg_" . 
$matchgrid->table_name, $matchgrid->attributes); + $Builder->build($matchgrid, $matchgrid->attributes); return true; } @@ -152,7 +152,7 @@ public function findActiveMatchgrids(Query $query, array $options) { * @throws Cake\Datasource\Exception\InvalidPrimaryKeyException */ - protected function getMatchgridConfig($id) { + public function getMatchgridConfig($id) { return $this->get($id, ['contain' => [ 'Attributes' => 'AttributeGroups'