summaryrefslogtreecommitdiff
path: root/www/wiki/extensions/SemanticMediaWiki/src/Elastic/QueryEngine/FieldMapper.php
diff options
context:
space:
mode:
Diffstat (limited to 'www/wiki/extensions/SemanticMediaWiki/src/Elastic/QueryEngine/FieldMapper.php')
-rw-r--r--www/wiki/extensions/SemanticMediaWiki/src/Elastic/QueryEngine/FieldMapper.php676
1 files changed, 676 insertions, 0 deletions
diff --git a/www/wiki/extensions/SemanticMediaWiki/src/Elastic/QueryEngine/FieldMapper.php b/www/wiki/extensions/SemanticMediaWiki/src/Elastic/QueryEngine/FieldMapper.php
new file mode 100644
index 00000000..ae3eafb5
--- /dev/null
+++ b/www/wiki/extensions/SemanticMediaWiki/src/Elastic/QueryEngine/FieldMapper.php
@@ -0,0 +1,676 @@
+<?php
+
+namespace SMW\Elastic\QueryEngine;
+
+use SMW\DataTypeRegistry;
+use SMW\DIProperty;
+
+/**
+ * @private
+ *
+ * @license GNU GPL v2+
+ * @since 3.0
+ *
+ * @author mwjames
+ */
+class FieldMapper {
+
+ const TYPE_MUST = 'must';
+ const TYPE_SHOULD = 'should';
+ const TYPE_MUST_NOT = 'must_not';
+ const TYPE_FILTER = 'filter';
+
+ /**
+ * @var boolean
+ */
+ private $isCompatMode = true;
+
+ /**
+ * @since 3.0
+ *
+ * @param boolean $isCompatMode
+ */
+ public function isCompatMode( $isCompatMode ) {
+ $this->isCompatMode = $isCompatMode;
+ }
+
+ /**
+ * @since 3.0
+ *
+ * @param integer $id
+ *
+ * @return string
+ */
+ public static function getPID( $id ) {
+ return "P:$id";
+ }
+
+ /**
+ * @since 3.0
+ *
+ * @param DIProperty $property
+ *
+ * @return string
+ */
+ public static function getFieldType( DIProperty $property ) {
+ return str_replace( [ '_' ], [ '' ], DataTypeRegistry::getInstance()->getFieldType( $property->findPropertyValueType() ) );
+ }
+
+ /**
+ * @since 3.0
+ *
+ * @param DIProperty $property
+ * @param string $affix
+ *
+ * @return string
+ */
+ public static function getField( DIProperty $property, $affix = 'Field' ) {
+ return self::getFieldType( $property ) . $affix;
+ }
+
+ /**
+ * @since 3.0
+ *
+ * @param string $value
+ *
+ * @return boolean
+ */
+ public static function isPhrase( $value = '' ) {
+ return $value{0} === '"' && substr( $value, -1 ) === '"';
+ }
+
+ /**
+ * @since 3.0
+ *
+ * @param string $value
+ *
+ * @return boolean
+ */
+ public static function hasWildcard( $value = '' ) {
+ return strpos( $value, '*' ) !== false && strpos( $value, '\*' ) === false;
+ }
+
+ /**
+ * @since 3.0
+ *
+ * @param string $value
+ *
+ * @return boolean
+ */
+ public function containsReservedChar( $value ) {
+
+ $reservedChars = [
+ '+', '-', '=', '&&', '||', '>', '<', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '\\', '//'
+ ];
+
+ foreach ( $reservedChars as $char ) {
+ if ( strpos( $value, $char ) !== false ) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ /**
+ * @see https://stackoverflow.com/questions/9796470/random-order-pagination-elasticsearch
+ * @see https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-function-score-query.html
+ *
+ * @since 3.0
+ *
+ * @param string $field
+ * @param mixed $value
+ *
+ * @return string
+ */
+ public function function_score_random( $query, $boost = 5 ) {
+ return [
+ 'function_score' => [
+ 'query' => $query,
+ "boost" => $boost,
+ "random_score" => new \stdClass(),
+ "boost_mode"=> "multiply"
+ ]
+ ];
+ }
+
+ /**
+ * @since 3.0
+ *
+ * @param array $results
+ * @param array $params
+ *
+ * @return []
+ */
+ public function field_filter( $field, $params ) {
+
+ $idList = [];
+
+ foreach ( $params as $key => $value ) {
+
+ if ( $key === $field ) {
+ return $value;
+ }
+
+ if ( !is_array( $value ) ) {
+ return [];
+ }
+
+ return $this->field_filter( $field, $value );
+ }
+
+ return $idList;
+ }
+
+ /**
+ * @since 3.0
+ *
+ * @param string $type
+ * @param array $params
+ *
+ * @return array
+ */
+ public function bool( $type, $params ) {
+ return [ 'bool' => [ $type => $params ] ];
+ }
+
+ /**
+ * @see https://www.elastic.co/guide/en/elasticsearch/reference/6.1/query-dsl-constant-score-query.html
+ * @see https://www.elastic.co/guide/en/elasticsearch/guide/current/filter-caching.html
+ *
+ * @since 3.0
+ *
+ * @param string $type
+ * @param array $params
+ *
+ * @return array
+ */
+ public function constant_score( $params ) {
+ return [ 'constant_score' => [ 'filter' => $params ] ];
+ }
+
+ /**
+ * @see https://www.elastic.co/guide/en/elasticsearch/reference/6.1/query-filter-context.html
+ *
+ * "... filter context, a query clause ... is a simple Yes or No — no scores
+ * are calculated. Filter context is mostly used for filtering structured
+ * data ...", " ... used filters will be cached automatically ..."
+ *
+ * @since 3.0
+ *
+ * @param string $type
+ * @param array $params
+ *
+ * @return array
+ */
+ public function filter( $params ) {
+ return [ 'filter' => $params ];
+ }
+
+ /**
+ * @see https://www.elastic.co/guide/en/elasticsearch/reference/6.1/query-dsl-geo-distance-query.html
+ *
+ * @since 3.0
+ *
+ * @param string $type
+ * @param array $params
+ *
+ * @return array
+ */
+ public function geo_distance( $field, $coordinates, $distance ) {
+ return [ 'geo_distance' => [ 'distance' => $distance, $field => $coordinates ] ];
+ }
+
+ /**
+ * @see https://www.elastic.co/guide/en/elasticsearch/reference/6.1/query-dsl-geo-bounding-box-query.html
+ *
+ * @since 3.0
+ *
+ * @param string $type
+ * @param array $params
+ *
+ * @return array
+ */
+ public function geo_bounding_box( $field, $top, $left, $bottom, $right ) {
+ return [ 'geo_bounding_box' => [ $field => [ 'top' => $top , 'left' => $left, 'bottom' => $bottom, 'right' => $right ] ] ];
+ }
+
+ /**
+ * @since 3.0
+ *
+ * @param string $field
+ * @param mixed $value
+ *
+ * @return string
+ */
+ public function range( $field, $value, $comp = '' ) {
+
+ $comparators = [
+ SMW_CMP_LESS => 'lt',
+ SMW_CMP_GRTR => 'gt',
+ SMW_CMP_LEQ => 'lte',
+ SMW_CMP_GEQ => 'gte'
+ ];
+
+ return [
+ [ 'range' => [ "$field" => [ $comparators[$comp] => $value ] ] ]
+ ];
+ }
+
+ /**
+ * @since 3.0
+ *
+ * @param string $field
+ * @param mixed $value
+ *
+ * @return string
+ */
+ public function match( $field, $value, $operator = 'or' ) {
+
+ if ( is_array( $field ) ) {
+ return $this->multi_match( $field, $value );
+ }
+
+ // Is it a phrase match as in "Foo bar"?
+ if ( $value !=='' && $value{0} === '"' && substr( $value, -1 ) === '"' ) {
+ return $this->match_phrase( $field, trim( $value, '"' ) );
+ }
+
+ if ( $operator !== 'or' ) {
+ return [
+ [ 'match' => [ "$field" => [ 'query' => $value, 'operator' => $operator ] ] ]
+ ];
+ }
+
+ return [
+ [ 'match' => [ $field => $value ] ]
+ ];
+ }
+
+ /**
+ * @see https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html
+ *
+ * - `best_fields` Finds documents which match any field, but uses the _score
+ * from the best field
+ * - `most_fields` Finds documents which match any field and combines the
+ * _score from each field
+ * - `cross_fields` Treats fields with the same analyzer as though they were
+ * one big field and looks for each word in any field
+ * - `phrase` Runs a match_phrase query on each field and combines the _score
+ * from each field
+ * - `phrase_prefix` Runs a match_phrase_prefix query on each field and
+ * combines the _score from each field
+ *
+ * @since 3.0
+ *
+ * @param string $field
+ * @param mixed $value
+ * @param array $params
+ *
+ * @return string
+ */
+ public function multi_match( $fields, $value, array $params = [] ) {
+
+ //return $this->multi_match( $field, trim( $value, '"' ) , [ "type" => "phrase" ] );
+
+ if ( strpos( $value, '"' ) !== false ) {
+ $value = trim( $value, '"' );
+ $params = [ "type" => "phrase" ];
+
+ if ( strpos( $value, '*' ) !== false ) {
+ $value = trim( $value, '*' );
+ $params = [ "type" => "phrase_prefix" ];
+ }
+ }
+
+ return [
+ [ 'multi_match' => [ 'fields' => $fields, 'query' => $value ] + $params ]
+ ];
+ }
+
+ /**
+ * @since 3.0
+ *
+ * @param string $field
+ * @param mixed $value
+ * @param array $params
+ *
+ * @return string
+ */
+ public function match_phrase( $field, $value, array $params = [] ) {
+
+ if ( strpos( $value, '*' ) !== false ) {
+ return [
+ 'match_phrase_prefix' => [ "$field" => trim( $value, '*' ) ]
+ ];
+ }
+
+ if ( $params !== [] ) {
+ return [
+ [ 'match_phrase' => [ "$field" => [ 'query' => $value ] + $params ] ]
+ ];
+ }
+
+ return [
+ [ 'match_phrase' => [ "$field" => $value ] ]
+ ];
+ }
+
+ /**
+ * In compat mode we try to guess and normalize the query string and hereby
+ * attempt to make the search execution to match closely the SMW SQL behaviour
+ * which comes at a cost that certain ES specific constructs ((), {} etc.)
+ * cannot be used when the `compat.mode` is enabled.
+ *
+ * @since 3.0
+ *
+ * @param mixed $value
+ *
+ * @return string
+ */
+ public function query_string_compat( $value, array $params = [] ) {
+
+ $wildcard = '';
+ // $params = [];
+
+ // Reserved characters are: + - = && || > < ! ( ) { } [ ] ^ " ~ * ? : \ /
+ // Failed the search with: {"error":{"root_cause":[{"type":"query_shard_exception","reason":"Failed to parse query [*{*]
+ // Use case: TQ0102
+ if ( $this->containsReservedChar( $value ) ) {
+ $value = str_replace(
+ [ '\\', '{', '}', '(', ')', '[', ']', '^', '=', '|', '/' , ':' ],
+ [ "\\\\", "\{", "\}", "\(", "\)", "\[", "\]", "\^", "\=", "\|", "\/", "\:" ],
+ $value
+ );
+ }
+
+ // Intended phrase or a single " char?
+ // Use case: TQ0102#13
+ if ( strpos( $value, '"' ) !== false && substr_count( $value, '"' ) < 2 ) {
+ $value = str_replace( '"' , '\"', $value );
+ } elseif ( substr_count( $value, '"' ) == 2 && strpos( $value, '~' ) !== false ) {
+ // https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#_fuzziness
+ // [[Has page::phrase:some text~2]] as "some text"~2
+ list( $value, $fuzziness ) = explode( '~', $value );
+ $value = "$value\"~" . str_replace( '"', '', $fuzziness );
+ }
+
+ // In this section we add modifiers to closely emulate the "known" SMW
+ // query behaviour of matching a string by SQL in terms of %foo% ...
+
+ // Uses Boolean parameters? avoid further guessing on the behalf of the
+ // query operation as in `[[Has text::~+MariaDB -database]]`
+ if ( strpos( $value, '+' ) !== false || strpos( $value, '-' ) !== false || strpos( $value, '"' ) !== false ) {
+ // Use case: `[[Has text::~sear*, -elas*]]`
+ // The user added those parameters by themselves
+ // Avoid comma separated strings
+ $value = str_replace( ',' , '', $value );
+ } elseif ( strpos( $value, ' ' ) !== false ) {
+ // Use case: `[[Has text::~some tex*]]
+ // Intention is to search for `some` AND `tex*`
+ $value = str_replace( [ ' ', '/*' ], [ ' +', '/ *' ], $value );
+ } elseif ( strpos( $value, '/' ) !== false ) {
+ // Use case: [[~Example/0608/*]]
+ // Somehow using the input as-is returns all sorts of matches mostly
+ // due to `/` being reserved hence split the string and create a
+ // conjunction using ES boolean expression `+` (AND) as in `Example`
+ // AND `0608*`
+ // T:Q0908 `http://example.org/some_title_with_a_value` becomes
+ // `example.org +some +title +with +a +value`
+ $value = str_replace( [ '\/', '/', ' +*', '_' ], [ '/', ' +', '*', ' +' ], $value );
+ } else {
+
+ // `_` in MediaWiki represents a space therefore replace it with an
+ // `+` (AND)
+ $value = str_replace( [ '_' ], [ ' ' ], $value );
+
+ // Use case: `[[Has text::~foo*]]`, `[[Has text::~foo]]`
+ // - add Boolean + which translates into "must be present"
+ if ( $value{0} !== '*' ) {
+ $value = "+$value";
+ }
+
+ // Use case: `[[Has text::~foo bar*]]`
+ if ( strpos( $value, ' ' ) !== false && substr( $value, -1 ) === '*' ) {
+ // $value = substr( $value, 0, -1 );
+ $wildcard = '*';
+ $params[ 'analyze_wildcard'] = true;
+ }
+
+ // Use case: `[[Has text::~foo bar*]]
+ // ... ( and ) signifies precedence
+ // ... " wraps a number of tokens to signify a phrase for searching
+ if ( strpos( $value, ' ' ) !== false && strpos( $value, '"' ) === false ) {
+ // $value = "\"($value)\"$wildcard";
+ }
+ }
+
+ // Force all terms to be required by ...
+ // $params[ 'default_operator'] = 'AND';
+
+ // Disable fuzzy transpositions (ab → ba)
+ // $params[ 'fuzzy_transpositions'] = false;
+
+ return [ $value, $params ];
+ }
+
+ /**
+ * @see https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html
+ * @since 3.0
+ *
+ * @param string|array $fields
+ * @param mixed $value
+ * @param array $params
+ *
+ * @return string
+ */
+ public function query_string( $fields, $value, array $params = [] ) {
+
+ if ( $this->isCompatMode ) {
+ list( $value, $params ) = $this->query_string_compat( $value, $params );
+ }
+
+ if ( !is_array( $fields ) ) {
+ $fields = [ $fields ];
+ }
+
+ return [
+ 'query_string' => [ 'fields' => $fields, 'query' => $value ] + $params
+ ];
+ }
+
+ /**
+ * @see https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-ids-query.html
+ * @since 3.0
+ *
+ * @param mixed $value
+ *
+ * @return string
+ */
+ public function ids( $value ) {
+ return [ 'ids' => [ "values" => $value ] ];
+ }
+
+ /**
+ * @see https://www.elastic.co/guide/en/elasticsearch/reference/6.1/query-dsl-term-query.html
+ * @since 3.0
+ *
+ * @param string $field
+ * @param mixed $value
+ *
+ * @return string
+ */
+ public function term( $field, $value ) {
+ return [ 'term' => [ "$field" => $value ] ];
+ }
+
+ /**
+ * Filters documents that have fields that match any of the provided terms
+ * (not analyzed).
+ *
+ * @see https://www.elastic.co/guide/en/elasticsearch/reference/6.1/query-dsl-term-query.html
+ *
+ * @since 3.0
+ *
+ * @param string $field
+ * @param mixed $value
+ *
+ * @return string
+ */
+ public function terms( $field, $value ) {
+
+ if ( !is_array( $value ) ) {
+ $value = [ $value ];
+ }
+
+ return [ 'terms' => [ "$field" => $value ] ];
+ }
+
+ /**
+ * @since 3.0
+ *
+ * @param string $field
+ * @param mixed $value
+ *
+ * @return string
+ */
+ public function wildcard( $field, $value ) {
+ return [ 'wildcard' => [ "$field" => $value ] ];
+ }
+
+ /**
+ * @since 3.0
+ *
+ * @param string $field
+ *
+ * @return string
+ */
+ public function exists( $field ) {
+ return [ 'exists' => [ "field" => "$field" ] ];
+ }
+
+ /**
+ * @see https://www.elastic.co/guide/en/elasticsearch/reference/6.2/search-aggregations.html
+ *
+ * @since 3.0
+ *
+ * @param string $field
+ * @param mixed $value
+ *
+ * @return string
+ */
+ public function aggs( $name, $params ) {
+ return [ 'aggregations' => [ "$name" => $params ] ];
+ }
+
+ /**
+ * @see https://www.elastic.co/guide/en/elasticsearch/reference/6.2/search-aggregations-bucket-terms-aggregation.html
+ *
+ * @since 3.0
+ *
+ * @param string $field
+ * @param mixed $value
+ *
+ * @return string
+ */
+ public function aggs_terms( $key, $field, $params = [] ) {
+ return [ $key => [ 'terms' => [ "field" => $field ] + $params ] ];
+ }
+
+ /**
+ * @see https://www.elastic.co/guide/en/elasticsearch/reference/6.2/search-aggregations-bucket-significantterms-aggregation.html
+ *
+ * Aggregation based on terms that have undergone a significant change in
+ * popularity measured between a foreground and background set.
+ *
+ * @since 3.0
+ *
+ * @param string $field
+ * @param mixed $value
+ *
+ * @return string
+ */
+ public function aggs_significant_terms( $key, $field, $params = [] ) {
+ return [ $key => [ 'significant_terms' => [ "field" => $field ] + $params ] ];
+ }
+
+ /**
+ * @see https://www.elastic.co/guide/en/elasticsearch/reference/6.2/search-aggregations-bucket-histogram-aggregation.html
+ *
+ * A multi-bucket values source based aggregation that can be applied on
+ * numeric values extracted from the documents. It dynamically builds fixed
+ * size (a.k.a. interval) buckets over the values.
+ *
+ * @since 3.0
+ *
+ * @param string $field
+ * @param mixed $value
+ *
+ * @return string
+ */
+ public static function aggs_histogram( $key, $field, $interval ) {
+ return [ $key => [ 'histogram' => [ "field" => $field, 'interval' => $interval ] ] ];
+ }
+
+ /**
+ * @see https://www.elastic.co/guide/en/elasticsearch/reference/6.2/search-aggregations-bucket-datehistogram-aggregation.html
+ *
+ * A multi-bucket aggregation similar to the histogram except it can only be
+ * applied on date values.
+ *
+ * @since 3.0
+ *
+ * @param string $field
+ * @param mixed $value
+ *
+ * @return string
+ */
+ public static function aggs_date_histogram( $key, $field, $interval ) {
+ return [ $key => [ 'date_histogram' => [ "field" => $field, 'interval' => $interval ] ] ];
+ }
+
+ /**
+ * @since 3.0
+ *
+ * @param Condition|array $params
+ * @param string $replacement
+ * @param array $hierarchy
+ *
+ * @return string
+ */
+ public function hierarchy( $params, $replacement, $hierarchy = [] ) {
+
+ if ( $hierarchy === [] ) {
+ return $params;
+ }
+
+ $str = is_array( $params ) ? json_encode( $params ) : (string)$params;
+
+ // P:, or iP:
+ list( $prefix, $id ) = explode( ':', $replacement );
+
+ $params = [];
+ $params[] = json_decode( $str, true );
+
+ foreach ( $hierarchy as $key ) {
+ // Quick and dirty to avoid iterating over an array and find a
+ // possible replacement without knowing the specific structure of
+ // an array
+ //
+ // Adding . to make it less likely that we replace a user value that
+ // appears as `P:42`
+ $params[] = json_decode( str_replace( "$replacement.", "$prefix:$key.", $str ), true );
+ }
+
+ $condition = new Condition( $params );
+
+ // Hierarchy as simple list of disjunctive (should) conditions where any
+ // of the condition is allowed to return a result. For example, a hierarchy
+ // defined as `Foo <- Foo1 <- Foo2` would be represented in terms of
+ // `[[Foo::bar]] OR [[Foo1::bar]] OR [[Foo2::bar]]`
+ $condition->type( 'should' );
+
+ return new Condition( $condition );
+ }
+
+}