Skip to content

Commit

Permalink
improved scripts for importing and processing classifications.
Browse files Browse the repository at this point in the history
  • Loading branch information
devletech committed Nov 29, 2024
1 parent f7e0833 commit 6233496
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 14 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-->
### Added
- classifications and project title metadata in the document's JSON export.
- improved scripts for importing and processing classifications

## v1.0.46.1 - 2024-11-27
### Fixed
Expand Down
1 change: 0 additions & 1 deletion library/Episciences/Classification.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
namespace Episciences;

use Episciences_Tools;
use http\Message\Body;

abstract class Classification implements \JsonSerializable
{
Expand Down
10 changes: 10 additions & 0 deletions library/Episciences/Paper/Classifications.php
Original file line number Diff line number Diff line change
Expand Up @@ -141,4 +141,14 @@ public function setSourceId(int $sourceId): self
}


/**
 * Verifies that a classification code is part of the supplied list of known codes.
 *
 * The lookup is strict (type-sensitive), so the codes in the list must have the
 * same type as $code to be considered a match.
 *
 * @param string $code                         code to look up
 * @param array  $availableClassificationCodes codes accepted for this classification
 *
 * @throws Zend_Exception when $code is absent from $availableClassificationCodes
 */
public function checkClassificationCode(string $code, array $availableClassificationCodes = []): void
{
    $isKnownCode = in_array($code, $availableClassificationCodes, true);

    if ($isKnownCode) {
        return;
    }

    // The classification name is upper-cased only for the error message.
    $message = sprintf(
        '[%s] code not found in %s classifications table',
        $code,
        strtoupper($this->classificationName)
    );

    throw new Zend_Exception($message);
}
}
36 changes: 30 additions & 6 deletions scripts/getClassificationJEL.php
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class GetClassificationJel extends JournalScript
private const ONE_MONTH = 3600 * 24 * 31;
private bool $_dryRun = false;
private Logger $logger;
private array $allClassificationCodes = [];

public function __construct(array $localopts)
{
Expand Down Expand Up @@ -46,14 +47,16 @@ public function run(): void
defineJournalConstants();
$db = Zend_Db_Table_Abstract::getDefaultAdapter();

$this->setAllClassificationCodes();

$this->logger->info('Fetching papers from database');
$select = $db
->select()
->from(T_PAPERS, ["DOI", "DOCID"])
->where('DOI != ""')
->where("STATUS = ?", Episciences_Paper::STATUS_PUBLISHED)
// ->where('RVID = ?', 3)
// ->limit(10)
// ->where('RVID = ?', 3)
// ->limit(10)
->order('DOCID ASC');

$papers = $db->fetchAll($select);
Expand Down Expand Up @@ -136,7 +139,7 @@ public function processApiData(array $jsonData): array
if (isset($subject['@classid']) && $subject['@classid'] === 'jel' && isset($subject['$'])) {
$value = $subject['$'];
if (str_starts_with($value, 'jel:')) {
$processedValue = ltrim($value, 'jel:');
$processedValue = ltrim($value, 'jel:');
if ($processedValue !== '') {
$results[] = $processedValue;
}
Expand All @@ -150,15 +153,25 @@ public function processApiData(array $jsonData): array
return array_unique($results);
}



private function createClassifications(array $jelCodes, int $docId): array
{
$collectionOfClassifications = [];

foreach ($jelCodes as $jelCode) {

$classification = new Episciences_Paper_Classifications();
$classification->setClassificationCode($jelCode);
$classification->setClassificationName(Episciences\Classification\jel::$classificationName);

try {
$classification->checkClassificationCode($jelCode, $this->getAllClassificationCodes());

} catch (Zend_Exception $e) {
$this->logger->warning($e->getMessage());
$this->logger->warning(sprintf('[%s] classification ignored !', $jelCode));
continue;
}

$classification->setClassificationCode($jelCode);
$classification->setDocid($docId);
$classification->setSourceId(Episciences_Repositories::GRAPH_OPENAIRE_ID);
$collectionOfClassifications[] = $classification;
Expand All @@ -176,6 +189,17 @@ public function setDryRun(bool $dryRun): void
{
$this->_dryRun = $dryRun;
}

/**
 * Loads the `code` column of the JEL classification table into $allClassificationCodes.
 *
 * If no database adapter is available the list is left empty rather than
 * assigning null to the array-typed property (which would raise a TypeError).
 * With an empty list, checkClassificationCode() rejects every code.
 */
private function setAllClassificationCodes(): void
{
    // Resolve the adapter once so the select and the fetch use the same handle.
    $db = $this->getDb();

    if ($db === null) {
        $this->allClassificationCodes = [];
        return;
    }

    $select = $db->select()->from(T_PAPER_CLASSIFICATION_JEL, ['code']);

    // Defensive fallback: keep the property an array even if the adapter returns null.
    $this->allClassificationCodes = $db->fetchCol($select) ?? [];
}

/**
 * Returns the classification codes currently held in $allClassificationCodes.
 *
 * The property defaults to an empty array and is filled by setAllClassificationCodes().
 */
public function getAllClassificationCodes(): array
{
return $this->allClassificationCodes;
}
}

$script = new GetClassificationJel([]);
Expand Down
30 changes: 29 additions & 1 deletion scripts/getClassificationMsc.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ class GetClassificationMsc extends JournalScript
private bool $_dryRun = false;
private Logger $logger;

private array $allClassificationCodes = [];

public function __construct(array $localopts)
{
$this->setRequiredParams([]);
Expand Down Expand Up @@ -46,6 +48,8 @@ public function run(): void
defineJournalConstants();
$db = Zend_Db_Table_Abstract::getDefaultAdapter();

$this->setAllClassificationCodes();

$this->logger->info('Fetching papers from database');
$select = $db
->select()
Expand Down Expand Up @@ -152,11 +156,24 @@ private function extractMSC2020Codes(array $apiResponse): array

private function createClassifications(array $mscCodes, int $docId): array
{

$collectionOfClassifications = [];

foreach ($mscCodes as $mscCode) {

$classification = new Episciences_Paper_Classifications();
$classification->setClassificationCode($mscCode);
$classification->setClassificationName(Episciences\Classification\msc2020::$classificationName);

try {
$classification->checkClassificationCode($mscCode, $this->getAllClassificationCodes());

} catch (Zend_Exception $e) {
$this->logger->warning($e->getMessage());
$this->logger->warning(sprintf('[%s] classification ignored !', $mscCode));
continue;
}

$classification->setClassificationCode($mscCode);
$classification->setDocid($docId);
$classification->setSourceId(Episciences_Repositories::ZBMATH_OPEN);
$collectionOfClassifications[] = $classification;
Expand All @@ -174,6 +191,17 @@ public function setDryRun(bool $dryRun): void
{
$this->_dryRun = $dryRun;
}

/**
 * Loads the `code` column of the MSC2020 classification table into $allClassificationCodes.
 *
 * If no database adapter is available the list is left empty rather than
 * assigning null to the array-typed property (which would raise a TypeError).
 * With an empty list, checkClassificationCode() rejects every code.
 */
private function setAllClassificationCodes(): void
{
    // Resolve the adapter once so the select and the fetch use the same handle.
    $db = $this->getDb();

    if ($db === null) {
        $this->allClassificationCodes = [];
        return;
    }

    $select = $db->select()->from(T_PAPER_CLASSIFICATION_MSC2020, ['code']);

    // Defensive fallback: keep the property an array even if the adapter returns null.
    $this->allClassificationCodes = $db->fetchCol($select) ?? [];
}

/**
 * Returns the classification codes currently held in $allClassificationCodes.
 *
 * The property defaults to an empty array and is filled by setAllClassificationCodes().
 */
public function getAllClassificationCodes(): array
{
return $this->allClassificationCodes;
}
}

$script = new GetClassificationMsc([]);
Expand Down
10 changes: 5 additions & 5 deletions scripts/importClassificationJEL.php
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,18 @@
}

echo "[" . date('Y-m-d H:i:s') . "] XML file successfully loaded.\n";
$tableName = 'classification_jel';

// Prepare the SQL dump content
$sqlDump = <<<SQL
-- SQL Dump for JEL Classifications
CREATE TABLE IF NOT EXISTS `classification_jel` (
CREATE TABLE IF NOT EXISTS `$tableName` (
`code` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
`label` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL
`label` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
PRIMARY KEY (`code`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
ALTER TABLE `jel` ADD PRIMARY KEY (`code`);
SQL;

$count = 0;
Expand All @@ -52,7 +52,7 @@
$label = htmlspecialchars($label, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

// Add the INSERT statement to the SQL dump
$sqlDump .= "INSERT INTO `jel` (`code`, `label`) VALUES ('$code', '$label');\n";
$sqlDump .= "INSERT INTO `$tableName` (`code`, `label`) VALUES ('$code', '$label');\n";
$count++;
}

Expand Down
2 changes: 1 addition & 1 deletion scripts/importClassificationMsc2020.php
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ function logMessage($message): void
$dumpFileContent .= "CREATE TABLE IF NOT EXISTS `$tableName` (\n";
$dumpFileContent .= " `code` varchar(10) COLLATE utf8mb4_general_ci NOT NULL,\n";
$dumpFileContent .= " `label` varchar(255) COLLATE utf8mb4_general_ci NOT NULL,\n";
$dumpFileContent .= " `description` varchar(255) COLLATE utf8mb4_general_ci NOT NULL,\n";
$dumpFileContent .= " `description` mediumtext COLLATE utf8mb4_general_ci NOT NULL,\n";
$dumpFileContent .= " PRIMARY KEY (`code`)\n";
$dumpFileContent .= ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;\n\n";

Expand Down

0 comments on commit 6233496

Please sign in to comment.