summaryrefslogtreecommitdiff
path: root/www/wiki/maintenance/importImages.php
diff options
context:
space:
mode:
Diffstat (limited to 'www/wiki/maintenance/importImages.php')
-rw-r--r--www/wiki/maintenance/importImages.php523
1 files changed, 523 insertions, 0 deletions
diff --git a/www/wiki/maintenance/importImages.php b/www/wiki/maintenance/importImages.php
new file mode 100644
index 00000000..5db1fa89
--- /dev/null
+++ b/www/wiki/maintenance/importImages.php
@@ -0,0 +1,523 @@
+<?php
+/**
+ * Import one or more images from the local file system into the wiki without
+ * using the web-based interface.
+ *
+ * "Smart import" additions:
+ * - aim: preserve the essential metadata (user, description) when importing media
+ * files from an existing wiki.
+ * - process:
+ * - interface with the source wiki, don't use bare files only (see --source-wiki-url).
+ * - fetch metadata from source wiki for each file to import.
+ * - commit the fetched metadata to the destination wiki while submitting.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup Maintenance
+ * @author Rob Church <robchur@gmail.com>
+ * @author Mij <mij@bitchx.it>
+ */
+
+require_once __DIR__ . '/Maintenance.php';
+
+class ImportImages extends Maintenance {
+
+ public function __construct() {
+ parent::__construct();
+
+ $this->addDescription( 'Imports images and other media files into the wiki' );
+ $this->addArg( 'dir', 'Path to the directory containing images to be imported' );
+
+ $this->addOption( 'extensions',
+ 'Comma-separated list of allowable extensions, defaults to $wgFileExtensions',
+ false,
+ true
+ );
+ $this->addOption( 'overwrite',
+ 'Overwrite existing images with the same name (default is to skip them)' );
+ $this->addOption( 'limit',
+ 'Limit the number of images to process. Ignored or skipped images are not counted',
+ false,
+ true
+ );
+ $this->addOption( 'from',
+ "Ignore all files until the one with the given name. Useful for resuming aborted "
+ . "imports. The name should be the file's canonical database form.",
+ false,
+ true
+ );
+ $this->addOption( 'skip-dupes',
+ 'Skip images that were already uploaded under a different name (check SHA1)' );
+ $this->addOption( 'search-recursively', 'Search recursively for files in subdirectories' );
+ $this->addOption( 'sleep',
+ 'Sleep between files. Useful mostly for debugging',
+ false,
+ true
+ );
+ $this->addOption( 'user',
+ "Set username of uploader, default 'Maintenance script'",
+ false,
+ true
+ );
+ // This parameter can optionally have an argument. If none specified, getOption()
+ // returns 1 which is precisely what we need.
+ $this->addOption( 'check-userblock', 'Check if the user got blocked during import' );
+ $this->addOption( 'comment',
+ "Set file description, default 'Importing file'",
+ false,
+ true
+ );
+ $this->addOption( 'comment-file',
+ 'Set description to the content of this file',
+ false,
+ true
+ );
+ $this->addOption( 'comment-ext',
+ 'Causes the description for each file to be loaded from a file with the same name, but '
+ . 'the extension provided. If a global description is also given, it is appended.',
+ false,
+ true
+ );
+ $this->addOption( 'summary',
+ 'Upload summary, description will be used if not provided',
+ false,
+ true
+ );
+ $this->addOption( 'license',
+ 'Use an optional license template',
+ false,
+ true
+ );
+ $this->addOption( 'timestamp',
+ 'Override upload time/date, all MediaWiki timestamp formats are accepted',
+ false,
+ true
+ );
+ $this->addOption( 'protect',
+ 'Specify the protect value (autoconfirmed,sysop)',
+ false,
+ true
+ );
+ $this->addOption( 'unprotect', 'Unprotects all uploaded images' );
+ $this->addOption( 'source-wiki-url',
+ 'If specified, take User and Comment data for each imported file from this URL. '
+ . 'For example, --source-wiki-url="http://en.wikipedia.org/',
+ false,
+ true
+ );
+ $this->addOption( 'dry', "Dry run, don't import anything" );
+ }
+
+ public function execute() {
+ global $wgFileExtensions, $wgUser, $wgRestrictionLevels;
+
+ $processed = $added = $ignored = $skipped = $overwritten = $failed = 0;
+
+ $this->output( "Import Images\n\n" );
+
+ $dir = $this->getArg( 0 );
+
+ # Check Protection
+ if ( $this->hasOption( 'protect' ) && $this->hasOption( 'unprotect' ) ) {
+ $this->fatalError( "Cannot specify both protect and unprotect. Only 1 is allowed.\n" );
+ }
+
+ if ( $this->hasOption( 'protect' ) && trim( $this->getOption( 'protect' ) ) ) {
+ $this->fatalError( "You must specify a protection option.\n" );
+ }
+
+ # Prepare the list of allowed extensions
+ $extensions = $this->hasOption( 'extensions' )
+ ? explode( ',', strtolower( $this->getOption( 'extensions' ) ) )
+ : $wgFileExtensions;
+
+ # Search the path provided for candidates for import
+ $files = $this->findFiles( $dir, $extensions, $this->hasOption( 'search-recursively' ) );
+
+ # Initialise the user for this operation
+ $user = $this->hasOption( 'user' )
+ ? User::newFromName( $this->getOption( 'user' ) )
+ : User::newSystemUser( 'Maintenance script', [ 'steal' => true ] );
+ if ( !$user instanceof User ) {
+ $user = User::newSystemUser( 'Maintenance script', [ 'steal' => true ] );
+ }
+ $wgUser = $user;
+
+ # Get block check. If a value is given, this specified how often the check is performed
+ $checkUserBlock = (int)$this->getOption( 'check-userblock' );
+
+ $from = $this->getOption( 'from' );
+ $sleep = (int)$this->getOption( 'sleep' );
+ $limit = (int)$this->getOption( 'limit' );
+ $timestamp = $this->getOption( 'timestamp', false );
+
+ # Get the upload comment. Provide a default one in case there's no comment given.
+ $commentFile = $this->getOption( 'comment-file' );
+ if ( $commentFile !== null ) {
+ $comment = file_get_contents( $commentFile );
+ if ( $comment === false || $comment === null ) {
+ $this->fatalError( "failed to read comment file: {$commentFile}\n" );
+ }
+ } else {
+ $comment = $this->getOption( 'comment', 'Importing file' );
+ }
+ $commentExt = $this->getOption( 'comment-ext' );
+ $summary = $this->getOption( 'summary', '' );
+
+ $license = $this->getOption( 'license', '' );
+
+ $sourceWikiUrl = $this->getOption( 'source-wiki-url' );
+
+ # Batch "upload" operation
+ $count = count( $files );
+ if ( $count > 0 ) {
+ foreach ( $files as $file ) {
+ if ( $sleep && ( $processed > 0 ) ) {
+ sleep( $sleep );
+ }
+
+ $base = UtfNormal\Validator::cleanUp( wfBaseName( $file ) );
+
+ # Validate a title
+ $title = Title::makeTitleSafe( NS_FILE, $base );
+ if ( !is_object( $title ) ) {
+ $this->output(
+ "{$base} could not be imported; a valid title cannot be produced\n" );
+ continue;
+ }
+
+ if ( $from ) {
+ if ( $from == $title->getDBkey() ) {
+ $from = null;
+ } else {
+ $ignored++;
+ continue;
+ }
+ }
+
+ if ( $checkUserBlock && ( ( $processed % $checkUserBlock ) == 0 ) ) {
+ $user->clearInstanceCache( 'name' ); // reload from DB!
+ if ( $user->isBlocked() ) {
+ $this->output( $user->getName() . " was blocked! Aborting.\n" );
+ break;
+ }
+ }
+
+ # Check existence
+ $image = wfLocalFile( $title );
+ if ( $image->exists() ) {
+ if ( $this->hasOption( 'overwrite' ) ) {
+ $this->output( "{$base} exists, overwriting..." );
+ $svar = 'overwritten';
+ } else {
+ $this->output( "{$base} exists, skipping\n" );
+ $skipped++;
+ continue;
+ }
+ } else {
+ if ( $this->hasOption( 'skip-dupes' ) ) {
+ $repo = $image->getRepo();
+ # XXX: we end up calculating this again when actually uploading. that sucks.
+ $sha1 = FSFile::getSha1Base36FromPath( $file );
+
+ $dupes = $repo->findBySha1( $sha1 );
+
+ if ( $dupes ) {
+ $this->output(
+ "{$base} already exists as {$dupes[0]->getName()}, skipping\n" );
+ $skipped++;
+ continue;
+ }
+ }
+
+ $this->output( "Importing {$base}..." );
+ $svar = 'added';
+ }
+
+ if ( $sourceWikiUrl ) {
+ /* find comment text directly from source wiki, through MW's API */
+ $real_comment = $this->getFileCommentFromSourceWiki( $sourceWikiUrl, $base );
+ if ( $real_comment === false ) {
+ $commentText = $comment;
+ } else {
+ $commentText = $real_comment;
+ }
+
+ /* find user directly from source wiki, through MW's API */
+ $real_user = $this->getFileUserFromSourceWiki( $sourceWikiUrl, $base );
+ if ( $real_user === false ) {
+ $wgUser = $user;
+ } else {
+ $wgUser = User::newFromName( $real_user );
+ if ( $wgUser === false ) {
+ # user does not exist in target wiki
+ $this->output(
+ "failed: user '$real_user' does not exist in target wiki." );
+ continue;
+ }
+ }
+ } else {
+ # Find comment text
+ $commentText = false;
+
+ if ( $commentExt ) {
+ $f = $this->findAuxFile( $file, $commentExt );
+ if ( !$f ) {
+ $this->output( " No comment file with extension {$commentExt} found "
+ . "for {$file}, using default comment. " );
+ } else {
+ $commentText = file_get_contents( $f );
+ if ( !$commentText ) {
+ $this->output(
+ " Failed to load comment file {$f}, using default comment. " );
+ }
+ }
+ }
+
+ if ( !$commentText ) {
+ $commentText = $comment;
+ }
+ }
+
+ # Import the file
+ if ( $this->hasOption( 'dry' ) ) {
+ $this->output(
+ " publishing {$file} by '{$wgUser->getName()}', comment '$commentText'... "
+ );
+ } else {
+ $mwProps = new MWFileProps( MediaWiki\MediaWikiServices::getInstance()->getMimeAnalyzer() );
+ $props = $mwProps->getPropsFromPath( $file, true );
+ $flags = 0;
+ $publishOptions = [];
+ $handler = MediaHandler::getHandler( $props['mime'] );
+ if ( $handler ) {
+ $metadata = Wikimedia\quietCall( 'unserialize', $props['metadata'] );
+
+ $publishOptions['headers'] = $handler->getContentHeaders( $metadata );
+ } else {
+ $publishOptions['headers'] = [];
+ }
+ $archive = $image->publish( $file, $flags, $publishOptions );
+ if ( !$archive->isGood() ) {
+ $this->output( "failed. (" .
+ $archive->getWikiText( false, false, 'en' ) .
+ ")\n" );
+ $failed++;
+ continue;
+ }
+ }
+
+ $commentText = SpecialUpload::getInitialPageText( $commentText, $license );
+ if ( !$this->hasOption( 'summary' ) ) {
+ $summary = $commentText;
+ }
+
+ if ( $this->hasOption( 'dry' ) ) {
+ $this->output( "done.\n" );
+ } elseif ( $image->recordUpload2(
+ $archive->value,
+ $summary,
+ $commentText,
+ $props,
+ $timestamp
+ )->isOK() ) {
+ # We're done!
+ $this->output( "done.\n" );
+
+ $doProtect = false;
+
+ $protectLevel = $this->getOption( 'protect' );
+
+ if ( $protectLevel && in_array( $protectLevel, $wgRestrictionLevels ) ) {
+ $doProtect = true;
+ }
+ if ( $this->hasOption( 'unprotect' ) ) {
+ $protectLevel = '';
+ $doProtect = true;
+ }
+
+ if ( $doProtect ) {
+ # Protect the file
+ $this->output( "\nWaiting for replica DBs...\n" );
+ // Wait for replica DBs.
+ sleep( 2.0 ); # Why this sleep?
+ wfWaitForSlaves();
+
+ $this->output( "\nSetting image restrictions ... " );
+
+ $cascade = false;
+ $restrictions = [];
+ foreach ( $title->getRestrictionTypes() as $type ) {
+ $restrictions[$type] = $protectLevel;
+ }
+
+ $page = WikiPage::factory( $title );
+ $status = $page->doUpdateRestrictions( $restrictions, [], $cascade, '', $user );
+ $this->output( ( $status->isOK() ? 'done' : 'failed' ) . "\n" );
+ }
+ } else {
+ $this->output( "failed. (at recordUpload stage)\n" );
+ $svar = 'failed';
+ }
+
+ $$svar++;
+ $processed++;
+
+ if ( $limit && $processed >= $limit ) {
+ break;
+ }
+ }
+
+ # Print out some statistics
+ $this->output( "\n" );
+ foreach (
+ [
+ 'count' => 'Found',
+ 'limit' => 'Limit',
+ 'ignored' => 'Ignored',
+ 'added' => 'Added',
+ 'skipped' => 'Skipped',
+ 'overwritten' => 'Overwritten',
+ 'failed' => 'Failed'
+ ] as $var => $desc
+ ) {
+ if ( $$var > 0 ) {
+ $this->output( "{$desc}: {$$var}\n" );
+ }
+ }
+ } else {
+ $this->output( "No suitable files could be found for import.\n" );
+ }
+ }
+
+ /**
+ * Search a directory for files with one of a set of extensions
+ *
+ * @param string $dir Path to directory to search
+ * @param array $exts Array of extensions to search for
+ * @param bool $recurse Search subdirectories recursively
+ * @return array|bool Array of filenames on success, or false on failure
+ */
+ private function findFiles( $dir, $exts, $recurse = false ) {
+ if ( is_dir( $dir ) ) {
+ $dhl = opendir( $dir );
+ if ( $dhl ) {
+ $files = [];
+ while ( ( $file = readdir( $dhl ) ) !== false ) {
+ if ( is_file( $dir . '/' . $file ) ) {
+ list( /* $name */, $ext ) = $this->splitFilename( $dir . '/' . $file );
+ if ( array_search( strtolower( $ext ), $exts ) !== false ) {
+ $files[] = $dir . '/' . $file;
+ }
+ } elseif ( $recurse && is_dir( $dir . '/' . $file ) && $file !== '..' && $file !== '.' ) {
+ $files = array_merge( $files, $this->findFiles( $dir . '/' . $file, $exts, true ) );
+ }
+ }
+
+ return $files;
+ } else {
+ return [];
+ }
+ } else {
+ return [];
+ }
+ }
+
+ /**
+ * Split a filename into filename and extension
+ *
+ * @param string $filename
+ * @return array
+ */
+ private function splitFilename( $filename ) {
+ $parts = explode( '.', $filename );
+ $ext = $parts[count( $parts ) - 1];
+ unset( $parts[count( $parts ) - 1] );
+ $fname = implode( '.', $parts );
+
+ return [ $fname, $ext ];
+ }
+
+ /**
+ * Find an auxilliary file with the given extension, matching
+ * the give base file path. $maxStrip determines how many extensions
+ * may be stripped from the original file name before appending the
+ * new extension. For example, with $maxStrip = 1 (the default),
+ * file files acme.foo.bar.txt and acme.foo.txt would be auxilliary
+ * files for acme.foo.bar and the extension ".txt". With $maxStrip = 2,
+ * acme.txt would also be acceptable.
+ *
+ * @param string $file Base path
+ * @param string $auxExtension The extension to be appended to the base path
+ * @param int $maxStrip The maximum number of extensions to strip from the base path (default: 1)
+ * @return string|bool
+ */
+ private function findAuxFile( $file, $auxExtension, $maxStrip = 1 ) {
+ if ( strpos( $auxExtension, '.' ) !== 0 ) {
+ $auxExtension = '.' . $auxExtension;
+ }
+
+ $d = dirname( $file );
+ $n = basename( $file );
+
+ while ( $maxStrip >= 0 ) {
+ $f = $d . '/' . $n . $auxExtension;
+
+ if ( file_exists( $f ) ) {
+ return $f;
+ }
+
+ $idx = strrpos( $n, '.' );
+ if ( !$idx ) {
+ break;
+ }
+
+ $n = substr( $n, 0, $idx );
+ $maxStrip -= 1;
+ }
+
+ return false;
+ }
+
+ # @todo FIXME: Access the api in a saner way and performing just one query
+ # (preferably batching files too).
+ private function getFileCommentFromSourceWiki( $wiki_host, $file ) {
+ $url = $wiki_host . '/api.php?action=query&format=xml&titles=File:'
+ . rawurlencode( $file ) . '&prop=imageinfo&&iiprop=comment';
+ $body = Http::get( $url, [], __METHOD__ );
+ if ( preg_match( '#<ii comment="([^"]*)" />#', $body, $matches ) == 0 ) {
+ return false;
+ }
+
+ return html_entity_decode( $matches[1] );
+ }
+
+ private function getFileUserFromSourceWiki( $wiki_host, $file ) {
+ $url = $wiki_host . '/api.php?action=query&format=xml&titles=File:'
+ . rawurlencode( $file ) . '&prop=imageinfo&&iiprop=user';
+ $body = Http::get( $url, [], __METHOD__ );
+ if ( preg_match( '#<ii user="([^"]*)" />#', $body, $matches ) == 0 ) {
+ return false;
+ }
+
+ return html_entity_decode( $matches[1] );
+ }
+
+}
+
// Standard maintenance-script boilerplate: register the class and hand
// control to the runner when this file is invoked from the command line.
$maintClass = ImportImages::class;
require_once RUN_MAINTENANCE_IF_MAIN;