| Server IP : 23.111.136.34 / Your IP : 216.73.216.136 Web Server : Apache System : Linux servidor.eurohost.com.br 3.10.0-1160.119.1.el7.x86_64 #1 SMP Tue Jun 4 14:43:51 UTC 2024 x86_64 User : meusitei ( 1072) PHP Version : 5.6.40 Disable Function : show_source, system, shell_exec, passthru, proc_open MySQL : ON | cURL : ON | WGET : ON | Perl : ON | Python : ON | Sudo : ON | Pkexec : ON Directory : /home/meusitei/public_html/wp-content/plugins/searchwp/includes/ |
Upload File : |
<?php
// Declare the $wp_filesystem global in this file's scope.
global $wp_filesystem;
// Stop immediately when ABSPATH is not defined.
if ( ! defined( 'ABSPATH' ) ) {
exit;
}
// Pull in wp-admin/includes/file.php relative to ABSPATH (only once).
/** @noinspection PhpIncludeInspection */
include_once ABSPATH . 'wp-admin/includes/file.php';
/**
* Class SearchWPIndexer is responsible for generating the search index
*/
class SearchWPIndexer {
/**
* @var object Stores post object during indexing
* @since 1.0
*/
private $post;
/**
* @var bool Whether there are posts left to index
* @since 1.0
*/
public $unindexedPosts = false;
/**
* @var int The maximum weight for a single term
* @since 1.0
*/
private /** @noinspection PhpUnusedPrivateFieldInspection */
$weightLimit = 500;
/**
* @var bool Whether the indexer should index numbers
* @since 1.0
*/
private /** @noinspection PhpUnusedPrivateFieldInspection */
$indexNumbers = false;
/**
* @var int Internal counter
* @since 1.0
*/
private /** @noinspection PhpUnusedPrivateFieldInspection */
$count = 0;
/**
* @var array Common words
* @since 1.0
*/
private $common = array();
/**
* @var int Maximum number of times we should try to index a post
*/
private $maxAttemptsToIndex = 2;
/**
* @var bool Whether to index Attachments at all
*/
private $indexAttachments = false;
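/**
* @var array Character entities as specified by Ando Saabas in Sphider http://www.sphider.eu/
*            NOTE(review): most keys below are literal characters identical to (or the uppercase
*            form of) their values; only '&apos', '&Aelig;' and '&Yhorn;' still look like entity
*            names. The keys may have been entity-decoded somewhere in transit - confirm against
*            the upstream source before relying on this map.
* @since 1.0
*/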
private /** @noinspection PhpUnusedPrivateFieldInspection */
$entities = array(
'&' => '&', '&apos' => "'", 'Þ' => 'Þ', 'ß' => 'ß', 'à' => 'à', 'á' => 'á',
'â' => 'â', 'ã' => 'ã', 'ä' => 'ä', 'å' => 'å', 'æ' => 'æ', 'ç' => 'ç',
'è' => 'è', 'é' => 'é', 'ê' => 'ê', 'ë' => 'ë', 'ì' => 'ì', 'í' => 'í',
'î' => 'î', 'ï' => 'ï', 'ð' => 'ð', 'ñ' => 'ñ', 'ò' => 'ò', 'ó' => 'ó',
'ô' => 'ô', 'õ' => 'õ', 'ö' => 'ö', 'ø' => 'ø', 'ù' => 'ù', 'ú' => 'ú',
'û' => 'û', 'ü' => 'ü', 'ý' => 'ý', 'þ' => 'þ', 'ÿ' => 'ÿ',
'À' => 'à', 'Á' => 'á', 'Â' => 'â', 'Ã' => 'ã', 'Ä' => 'ä',
'Å' => 'å', '&Aelig;' => 'æ', 'Ç' => 'ç', 'È' => 'è', 'É' => 'é', 'Ê' => 'ê',
'Ë' => 'ë', 'Ì' => 'ì', 'Í' => 'í', 'Î' => 'î', 'Ï' => 'ï', 'Ð' => 'ð',
'Ñ' => 'ñ', 'Ò' => 'ò', 'Ó' => 'ó', 'Ô' => 'ô', 'Õ' => 'õ', 'Ö' => 'ö',
'Ø' => 'ø', 'Ù' => 'ù', 'Ú' => 'ú', 'Û' => 'û', 'Ü' => 'ü', 'Ý' => 'ý',
'&Yhorn;' => 'þ', 'Ÿ' => 'ÿ',
);
/**
* @var array Post IDs to forcibly exclude from indexing process
*/
private $excludeFromIndex = array();
/**
* @var array|string post type(s) to include when indexing
*/
private $postTypesToIndex = 'any';
/**
* @var string|array post status(es) to include when indexing
*
* @since 1.6.10
*/
private $post_statuses = 'publish';
/**
* @var int The maximum length of a term, as defined by the database schema
*
* @since 1.8.4
*/
private $max_term_length = 80;
/**
* @var bool Whether SearchWP will also keep track of accent-less versions of accented terms when indexing
* which allows for 'lazy' searches without accents to show accented results
*/
private $lenient_accents = false;
/**
* @var string The indexer validation hash
*/
public $hash;
/**
* @var int Threshold (in characters) to trigger big data handling
*/
private $big_data_trigger = 10000;
/**
* Constructor
*
* @param string $hash The key used to validate instantiation
* @since 1.0
*/
public function __construct( $hash = '' ) {
	$searchwp = SWP();

	do_action( 'searchwp_indexer_pre' );

	$this->init();

	// Nothing to do when neither post types nor Attachments are set to be indexed
	if ( empty( $this->postTypesToIndex ) && empty( $this->indexAttachments ) ) {
		return;
	}

	// If there are no initial settings, there's nothing to do
	$initial_settings = searchwp_get_setting( 'initial_settings' );
	if ( empty( $initial_settings ) ) {
		return;
	}

	// make sure we've got a valid request to index: the supplied hash must match
	// the stored 'searchwp_transient' option (read uncached)
	wp_cache_delete( 'searchwp_transient', 'options' );
	if ( get_option( 'searchwp_transient' ) !== $hash ) {
		if ( ! empty( $hash ) ) {
			do_action( 'searchwp_log', 'Invalid index request ' . $hash );
		} else {
			// REQUEST_URI is not guaranteed to be set (e.g. outside of a web request)
			$request_uri = isset( $_SERVER['REQUEST_URI'] ) ? $_SERVER['REQUEST_URI'] : '';
			do_action( 'searchwp_log', 'External SearchWPIndexer instantiation ' . $request_uri );
		}

		return;
	}

	/**
	 * Allow for some catch-up from the last request
	 */
	// auto-throttle based on load
	$waitTime = 1;
	$waiting  = false;
	if ( apply_filters( 'searchwp_indexer_load_monitoring', true ) && function_exists( 'sys_getloadavg' ) ) {
		$load = sys_getloadavg();

		// sys_getloadavg() returns false on failure, in which case we treat the load as 0
		$currentLoad   = ( is_array( $load ) && isset( $load[0] ) ) ? $load[0] : 0;
		$loadThreshold = abs( apply_filters( 'searchwp_load_maximum', 2 ) );

		// if the load has breached the threshold, scale the wait time
		if ( $currentLoad > $loadThreshold ) {
			$waiting  = true;
			$waitTime = 4 * floor( $currentLoad );
			do_action( 'searchwp_log', 'Load threshold (' . $loadThreshold . ') has been breached! Current load: ' . $currentLoad . '. Automatically injecting a wait time of ' . $waitTime );

			// this flag is going to prevent the indexer from jumpstarting which could very well trigger parallel indexers
			searchwp_update_option( 'waiting', true );
		}
	}

	// allow developers to throttle the indexer
	$waitTime = absint( apply_filters( 'searchwp_indexer_throttle', $waitTime ) );

	// never sleep longer than max_execution_time allows (minus a 5s buffer, minimum 10s)
	$iniMaxExecutionTime = absint( ini_get( 'max_execution_time' ) ) - 5;
	if ( $iniMaxExecutionTime < 10 ) {
		$iniMaxExecutionTime = 10;
	}
	if ( $waitTime > $iniMaxExecutionTime ) {
		do_action( 'searchwp_log', 'Requested throttle of ' . $waitTime . 's exceeds max execution time, forcing ' . $iniMaxExecutionTime . 's' );
		$waitTime = $iniMaxExecutionTime;
	}

	$memoryUse = size_format( memory_get_usage() );
	do_action( 'searchwp_log', 'Memory usage: ' . $memoryUse . ' - sleeping for ' . $waitTime . 's' );

	if ( 1 === $waitTime ) {
		// wait time was not adjusted, so we're just going to usleep because 1 second is an eternity
		usleep( 750000 );
	} else {
		sleep( $waitTime );
	}

	if ( $waiting ) {
		searchwp_update_option( 'waiting', false );
	}

	// see if the indexer has stalled
	searchwp_check_for_stalled_indexer();

	// check to see if indexer is already running
	$running = searchwp_get_setting( 'running' );
	if ( ! empty( $running ) ) {
		do_action( 'searchwp_log', 'SHORT CIRCUIT: Indexer already running' );

		return;
	}

	do_action( 'searchwp_log', 'Indexer NOW RUNNING' );
	searchwp_set_setting( 'last_activity', current_time( 'timestamp' ), 'stats' );
	searchwp_set_setting( 'running', true );
	do_action( 'searchwp_indexer_running' );

	if ( apply_filters( 'searchwp_remove_pre_get_posts', true ) ) {
		remove_all_actions( 'pre_get_posts' );
		remove_all_filters( 'pre_get_posts' );
	}

	$this->update_running_counts();

	if ( false !== $this->find_unindexed_posts() ) {
		do_action( 'searchwp_indexer_posts' );

		$start_time = time();

		// index this chunk of posts
		$this->index();

		$index_time = time() - $start_time;

		// clean up
		do_action( 'searchwp_log', 'Indexing chunk complete: ' . $index_time . 's' );
		searchwp_set_setting( 'running', false );
		searchwp_set_setting( 'in_process', false, 'stats' );
		searchwp_update_option( 'busy', false );

		// reset the transient
		$this->hash = sprintf( '%.22F', microtime( true ) ); // inspired by $doing_wp_cron
		update_option( 'searchwp_transient', $this->hash, 'no' );

		$destination = esc_url_raw( $searchwp->endpoint . '?swpnonce=' . $this->hash );
		do_action( 'searchwp_log', 'Request index (internal loopback) ' . $destination );

		$timeout = abs( apply_filters( 'searchwp_timeout', 0.02 ) );

		// recursive trigger: non-blocking request carrying the new hash
		$args = array(
			'body'       => array( 'swpnonce' => $this->hash ),
			'blocking'   => false,
			'user-agent' => 'SearchWP',
			'timeout'    => $timeout,
			'sslverify'  => false,
		);
		$args = apply_filters( 'searchwp_indexer_loopback_args', $args );

		do_action( 'searchwp_indexer_loopback', $args );

		if ( ! apply_filters( 'searchwp_alternate_indexer', false ) ) {
			wp_remote_post( $destination, $args );
		}

		return;
	}

	do_action( 'searchwp_log', 'Nothing left to index' );
	do_action( 'searchwp_index_up_to_date' );

	$initial = searchwp_get_setting( 'initial_index_built' );
	if ( empty( $initial ) ) {
		wp_clear_scheduled_hook( 'swp_indexer' ); // clear out the pre-initial-index cron event
		do_action( 'searchwp_log', 'Initial index complete' );
		searchwp_set_setting( 'initial_index_built', true );
		do_action( 'searchwp_index_initial_complete' );
	}

	searchwp_set_setting( 'running', false );
	searchwp_set_setting( 'in_process', false, 'stats' );
	searchwp_update_option( 'busy', false );

	// delta updates may have been triggered, so now that the initial index has been built we can process them
	$purge_queue = searchwp_get_option( 'purge_queue' );
	if ( ! empty( $purge_queue ) ) {
		$timeout = abs( apply_filters( 'searchwp_timeout', 0.02 ) );

		// we don't need a hash because the purge queue is checked per request
		$destination = esc_url_raw( $searchwp->endpoint . '?swpdeltas=swpdeltas&' . sprintf( '%.22F', microtime( true ) ) );

		// recursive trigger
		$args = array(
			'body'       => array( 'swpdeltas' => 'swpdeltas' ),
			'blocking'   => false,
			'user-agent' => 'SearchWP',
			'timeout'    => $timeout,
			'sslverify'  => false,
		);
		$args = apply_filters( 'searchwp_indexer_loopback_args', $args );

		do_action( 'searchwp_indexer_loopback', $args );

		wp_remote_post( $destination, $args );
	}
}
/**
 * Populate the indexer's working settings (common words, post types, post statuses,
 * excluded post IDs, Attachment handling) from SWP() and the related filters
 */
function init() {
	$this->common = SWP()->common;

	// by default let's only grab 'enabled' post types across the board (so as to keep the index size at a minimum)
	$this->postTypesToIndex = SWP()->get_enabled_post_types_across_all_engines();

	$this->big_data_trigger = absint( apply_filters( 'searchwp_term_count_limit', $this->big_data_trigger ) );

	$this->lenient_accents = apply_filters( 'searchwp_leinant_accents', $this->lenient_accents ); // deprecated
	$this->lenient_accents = apply_filters( 'searchwp_lenient_accents', $this->lenient_accents );

	// dynamically decide whether we're going to index Attachments based on whether Media is enabled for any search engine
	$index_attachments_from_settings = is_array( $this->postTypesToIndex )
		&& in_array( 'attachment', $this->postTypesToIndex, true );

	// allow dev to completely disable indexing of Attachments to save indexing time
	$this->indexAttachments = apply_filters( 'searchwp_index_attachments', $index_attachments_from_settings );
	if ( ! is_bool( $this->indexAttachments ) ) {
		$this->indexAttachments = false;
	}

	// allow dev to customize which post statuses are included
	$this->post_statuses = (array) apply_filters( 'searchwp_post_statuses', $this->post_statuses, null );
	foreach ( $this->post_statuses as $post_status_key => $post_status_value ) {
		$this->post_statuses[ $post_status_key ] = sanitize_key( $post_status_value );
	}

	// allow dev to forcefully omit posts from being indexed
	$this->excludeFromIndex = apply_filters( 'searchwp_prevent_indexing', array() );
	if ( ! is_array( $this->excludeFromIndex ) ) {
		$this->excludeFromIndex = array();
	}

	// UPDATE @since 2.9.0 the indexer is even more restricted in that there are
	// taxonomy limiters in place that can either exclude or limit to taxonomy
	// terms. There's no reason to index posts that are excluded, and we can reduce
	// the size of the index when limiters are in play, so we'll do that
	if ( apply_filters( 'searchwp_indexer_apply_engines_rules', true ) ) {
		$post__not_in = SWP()->get_post__not_in_across_all_engines( $this->excludeFromIndex );
		if ( ! empty( $post__not_in ) ) {
			$this->excludeFromIndex = array_merge( $this->excludeFromIndex, $post__not_in );
		}

		$this->excludeFromIndex = apply_filters( 'searchwp_indexer_excluded_by_rules', $this->excludeFromIndex );
	}

	// a filter callback may hand back something other than an array, so normalize
	// before the array functions below are applied
	$this->excludeFromIndex = array_map( 'absint', (array) $this->excludeFromIndex );
	$this->excludeFromIndex = array_unique( $this->excludeFromIndex );

	// allow dev to forcefully omit post types that would normally be indexed
	$this->postTypesToIndex = apply_filters( 'searchwp_indexed_post_types', $this->postTypesToIndex );

	// attachments cannot be included here, to omit attachments use the searchwp_index_attachments filter
	// so we have to check to make sure attachments were not included
	if ( is_array( $this->postTypesToIndex ) ) {
		// de-duplicate only once we know we have an array; array_unique() does not accept a string
		$this->postTypesToIndex = array_unique( $this->postTypesToIndex );

		foreach ( $this->postTypesToIndex as $key => $postType ) {
			$post_type_lower = function_exists( 'mb_strtolower' ) ? mb_strtolower( $postType, 'UTF-8' ) : strtolower( $postType );
			if ( 'attachment' === $post_type_lower ) {
				unset( $this->postTypesToIndex[ $key ] );
			}
		}
	} elseif ( is_string( $this->postTypesToIndex ) && 'attachment' === strtolower( $this->postTypesToIndex ) ) {
		$this->postTypesToIndex = 'any';
	}
}
/**
 * Retrieve a display-ready, approximate number of rows in the main index table
 *
 * The table status estimate is used when available, otherwise an exact COUNT() is run.
 *
 * @return string The row count run through format_big_number() and prefixed with '~'
 */
public function get_main_table_row_count() {
	global $wpdb;

	$index_table = $wpdb->prefix . SEARCHWP_DBPREFIX . 'index';

	// Try to get an estimate if possible (it's cheaper to get).
	// The table name is escaped for LIKE because "_" and "%" are wildcards in a LIKE
	// pattern and could otherwise match a different table.
	$index_table_status = $wpdb->get_row(
		$wpdb->prepare( 'SHOW TABLE STATUS LIKE %s', $wpdb->esc_like( $index_table ) )
	);

	$row_count = isset( $index_table_status->Rows ) ? absint( $index_table_status->Rows ) : 0;

	// If the estimate failed, fall back to exact count.
	if ( empty( $row_count ) ) {
		$row_count = absint( $wpdb->get_var( "SELECT COUNT(id) FROM {$index_table}" ) );
	}

	return '~' . $this->format_big_number( $row_count );
}
/**
 * Abbreviate a number for display using K / M / Bn / Tn suffixes
 *
 * Values below 1,000 are returned via number_format(). Values that land exactly on a
 * unit boundary use that unit (1000000 yields '1M', not '1000K').
 *
 * @param int|float|string $n The number to format; non-numeric input is treated as 0
 *
 * @return string The abbreviated number
 */
public function format_big_number( $n ) {
	// number_format() and the comparisons below need a real number
	$n = is_numeric( $n ) ? $n + 0 : 0;

	// largest unit first so the first match wins
	$units = array(
		'Tn' => 1000000000000,
		'Bn' => 1000000000,
		'M'  => 1000000,
		'K'  => 1000,
	);

	foreach ( $units as $suffix => $divisor ) {
		if ( $n >= $divisor ) {
			return round( $n / $divisor ) . $suffix;
		}
	}

	return number_format( $n );
}
/**
* Determine the number of posts left to index, total post count, and how many posts have been indexed already
*
* @since 1.0
*/
function update_running_counts() {
	$total_count = intval( $this->count_total_posts() );

	// edge case: if an index was performed and attachments indexed, then the user decides to disable
	// the indexing of attachments, the indexed count could potentially be greater than the total,
	// so the indexed count is capped at the total
	$indexed_count = min( intval( $this->indexed_count() ), $total_count );

	$remaining_count = intval( $total_count - $indexed_count );

	// persist the three counters in the 'stats' group
	searchwp_set_setting( 'total', $total_count, 'stats' );
	searchwp_set_setting( 'remaining', $remaining_count, 'stats' );
	searchwp_set_setting( 'done', $indexed_count, 'stats' );

	// progress is stored as a percentage with two decimals
	if ( $total_count > 0 ) {
		$progress = ( ( $total_count - $remaining_count ) / $total_count ) * 100;
	} else {
		$progress = 0;
	}
	searchwp_update_option( 'progress', number_format( $progress, 2, '.', '' ) );

	do_action( 'searchwp_log', 'Updating counts: ' . $total_count . ' ' . $remaining_count . ' ' . $indexed_count );

	// nothing remaining means the initial index is considered built
	if ( $remaining_count < 1 ) {
		do_action( 'searchwp_log', 'Setting initial' );
		searchwp_set_setting( 'initial_index_built', true );
	}
}
/**
* Sets post property
*
* @param $post object WordPress Post object
* @since 1.0
*/
function set_post( $post ) {
	$this->post = apply_filters( 'searchwp_pre_set_post', $post );

	// attach Custom Field data, unserializing every stored value one level down
	$this->post->custom = get_post_custom( $post->ID );
	foreach ( $this->post->custom as $field_name => $field_values ) {
		if ( is_array( $field_values ) ) {
			$field_values = array_map( 'maybe_unserialize', $field_values );
		}
		$this->post->custom[ $field_name ] = $field_values;
	}

	// Gutenberg blocks are rendered before Shortcodes because block parsing
	// will itself parse Shortcodes when Shortcode blocks are used
	$blocks_available = function_exists( 'has_blocks' ) && function_exists( 'do_blocks' );
	if ( $blocks_available ) {
		$block_context = array(
			'post'  => $post,
			'field' => 'post_content',
		);
		$render_blocks = apply_filters( 'searchwp_do_blocks', true, $block_context );

		if ( $render_blocks && has_blocks( $this->post->post_content ) ) {
			$this->post->post_content = do_blocks( $this->post->post_content );
		}
	}

	// Shortcodes in the content are only parsed when a developer opts in
	$parse_content_shortcodes = apply_filters( 'searchwp_do_shortcode', false, $post, 'post_content', false );
	if ( $parse_content_shortcodes ) {
		$this->post->post_content = do_shortcode( $this->post->post_content );
	}

	// the same opt-in is evaluated separately for each Custom Field
	if ( ! empty( $this->post->custom ) ) {
		foreach ( $this->post->custom as $field_name => $field_values ) {
			if ( ! apply_filters( 'searchwp_do_shortcode', false, $post, 'custom_field', $field_name ) ) {
				continue;
			}
			$this->post->custom[ $field_name ] = $this->do_shortcode_deep( $field_values );
		}
	}

	// allow developer the ability to manually manipulate the post content or Custom Field data
	$this->post = apply_filters( 'searchwp_set_post', $this->post );
}
/**
* Process potential Shortcodes
*
* @since 2.6.2
*
* @param $content
*
* @return string
*/
function do_shortcode_deep( $content ) {
	// strings are the leaves that actually get their Shortcodes processed
	if ( is_string( $content ) ) {
		return do_shortcode( $content );
	}

	// anything that is neither a string nor an array is passed through untouched
	if ( ! is_array( $content ) ) {
		return $content;
	}

	// arrays are walked recursively, keeping their keys
	$processed = array();
	foreach ( $content as $index => $item ) {
		$processed[ $index ] = $this->do_shortcode_deep( $item );
	}

	return $processed;
}
/**
* Count the total number of posts in this WordPress installation
*
* @return int Total number of posts
* @since 1.0
*/
function count_total_posts() {
	$grand_total = 0;

	// make sure Attachments are part of the post type list when they are to be indexed
	if ( ! empty( $this->indexAttachments ) && ! in_array( 'attachment', $this->postTypesToIndex ) ) {
		$this->postTypesToIndex[] = 'attachment';
	}

	if ( empty( $this->postTypesToIndex ) ) {
		return $grand_total;
	}

	$apply_engine_rules_tag = 'searchwp_indexer_apply_engines_rules';

	// one query per post type; only found_posts is of interest, hence a single post per page
	foreach ( $this->postTypesToIndex as $post_type ) {
		$is_attachment = ( $post_type === 'attachment' );

		$query_args = array(
			'posts_per_page'   => 1,
			'post_type'        => $post_type,
			'post_status'      => $is_attachment ? 'inherit' : $this->post_statuses,
			'post__not_in'     => $this->excludeFromIndex,
			'suppress_filters' => true,
			'cache_results'    => false,
			'meta_query'       => array(
				array(
					'key'     => '_' . SEARCHWP_PREFIX . 'skip',
					'value'   => '', // only want media that hasn't failed indexing multiple times
					'compare' => 'NOT EXISTS',
					'type'    => 'BINARY',
				),
			),
		);

		// See note in find_unindexed_posts about 2.9.0
		if ( apply_filters( $apply_engine_rules_tag, true ) ) {
			$query_args['tax_query'] = SWP()->get_post_type_tax_query_for_rules( $post_type, 'limit_to' );
		}

		// allow devs to have more control over what is considered unindexed
		if ( $is_attachment ) {
			// Apply mime type exclusions if applicable
			$mime_limit = $this->get_global_mime_limit();
			if ( ! empty( $mime_limit ) ) {
				$query_args['post_mime_type'] = $mime_limit;
			}

			$query_args = apply_filters( 'searchwp_indexer_unindexed_media_args', $query_args );
		} else {
			$query_args = apply_filters( 'searchwp_indexer_unindexed_args', $query_args );
		}

		$post_type_query = new WP_Query( $query_args );
		$grand_total    += absint( $post_type_query->found_posts );
	}

	return $grand_total;
}
/**
* Count the number of posts that have been indexed
*
* @return int Number of posts that have been indexed
* @since 1.0
*/
function indexed_count() {
	// work on a copy so the Attachment post type can be appended without touching the property
	$post_types = $this->postTypesToIndex;
	if ( $this->indexAttachments && ! in_array( 'attachment', $this->postTypesToIndex ) ) {
		$post_types[] = 'attachment';
	}

	// when Attachments are indexed every post status is counted
	$post_status = $this->indexAttachments ? 'any' : $this->post_statuses;

	// a post counts as indexed when it has a last_index flag and has not been flagged to skip
	$has_last_index = array(
		'key'     => '_' . SEARCHWP_PREFIX . 'last_index',
		'compare' => 'EXISTS',
		'type'    => 'NUMERIC',
	);
	$not_skipped = array(
		'key'     => '_' . SEARCHWP_PREFIX . 'skip',
		'value'   => '', // only want media that hasn't failed indexing multiple times
		'compare' => 'NOT EXISTS',
		'type'    => 'BINARY',
	);

	// TODO: should we include 'exclude_from_search' for accuracy?
	$query = new WP_Query( array(
		'posts_per_page'   => 1,
		'post_type'        => $post_types,
		'post_status'      => $post_status,
		'suppress_filters' => true,
		'cache_results'    => false,
		'meta_query'       => array(
			'relation' => 'AND',
			$has_last_index,
			$not_skipped,
		),
	) );

	return absint( $query->found_posts );
}
/**
* Query for posts that have not been indexed yet
*
* @return array|bool Posts (max 10) that have yet to be indexed
* @since 1.0
*/
function find_unindexed_posts() {
	$chunk_size = apply_filters( 'searchwp_index_chunk_size', 10 );

	// Media will be done last
	if ( ! empty( $this->indexAttachments ) && ! in_array( 'attachment', $this->postTypesToIndex ) ) {
		$this->postTypesToIndex[] = 'attachment';
	}

	if ( empty( $this->postTypesToIndex ) ) {
		return false;
	}

	$posts_per_page = intval( $chunk_size );

	// post types are checked one at a time; the first one with unindexed posts wins
	foreach ( $this->postTypesToIndex as $post_type ) {
		$is_attachment = ( $post_type === 'attachment' );

		$query_args = array(
			'posts_per_page'   => $posts_per_page,
			'post_type'        => $post_type,
			'post_status'      => $is_attachment ? 'inherit' : $this->post_statuses,
			'post__not_in'     => $this->excludeFromIndex,
			'suppress_filters' => true,
			'cache_results'    => false,
			'no_found_rows'    => true,
			'meta_query'       => array(
				'relation' => 'AND',
				array(
					'key'     => '_' . SEARCHWP_PREFIX . 'last_index',
					'value'   => '', // http://core.trac.wordpress.org/ticket/23268
					'compare' => 'NOT EXISTS',
					'type'    => 'NUMERIC',
				),
				array( // only want media that hasn't failed indexing multiple times
					'key'     => '_' . SEARCHWP_PREFIX . 'skip',
					'compare' => 'NOT EXISTS',
					'type'    => 'BINARY',
				),
				array( // if a PDF was flagged during indexing, we don't want to keep trying
					'key'     => '_' . SEARCHWP_PREFIX . 'review',
					'compare' => 'NOT EXISTS',
					'type'    => 'BINARY',
				),
			),
		);

		// TODO if searching in the admin is enabled, should we exclude anything? what if
		// site owner wants unrestricted search in the admin, but restricted on the front end?

		// @since 2.9.0 the index can be limited to certain taxonomy terms which could
		// greatly reduce the overall index size in certain circumstances but this requires that we
		// iterate through each post type to determine what should be indexed
		if ( apply_filters( 'searchwp_indexer_apply_engines_rules', true ) ) {
			$query_args['tax_query'] = SWP()->get_post_type_tax_query_for_rules( $post_type, 'limit_to' );
		}

		// allow devs to have more control over what is considered unindexed
		if ( $is_attachment ) {
			// Apply mime type exclusions if applicable
			$mime_limit = $this->get_global_mime_limit();
			if ( ! empty( $mime_limit ) ) {
				$query_args['post_mime_type'] = $mime_limit;
			}

			$query_args = apply_filters( 'searchwp_indexer_unindexed_media_args', $query_args );
		} else {
			$query_args = apply_filters( 'searchwp_indexer_unindexed_args', $query_args );
		}

		$found = get_posts( $query_args );
		if ( empty( $found ) ) {
			continue;
		}

		$this->unindexedPosts = $found;
		break;
	}

	return $this->unindexedPosts;
}
/**
 * Work out the mime type limit that applies across ALL engines
 *
 * @return array Empty when no engines are configured or when any enabled engine indexes
 *               all mime types; otherwise whatever SWP()->get_mimes_from_settings_ids()
 *               returns for the combined mime IDs
 */
function get_global_mime_limit() {
	$swp_settings = SWP()->settings;
	$engines      = isset( $swp_settings['engines'] ) ? SWP()->settings['engines'] : array();

	if ( empty( $engines ) ) {
		return array();
	}

	// collect the mime limit of every engine that has Attachments enabled
	$limits_by_engine = array();
	foreach ( $engines as $engine => $engine_settings ) {
		foreach ( $engine_settings as $post_type => $post_type_settings ) {
			if ( empty( $post_type_settings['enabled'] ) || 'attachment' !== $post_type ) {
				continue;
			}

			$engine_mimes        = isset( $post_type_settings['options']['mimes'] ) ? $post_type_settings['options']['mimes'] : '';
			$engine_mimes_string = (string) $engine_mimes;

			// This check is a bit strange because the All Documents mime group is represented by string '0'
			if ( empty( $engine_mimes ) && '' === trim( $engine_mimes_string ) ) {
				// An engine without limiters needs every mime type indexed, which rules out a global limit
				return array();
			}

			// Store these mime limits for this engine, because we need GLOBAL rules
			// in other words if there are multiple engines but mime limits only on one engine,
			// we cannot limit the mime type in the index because the other engine will be missing results
			$limits_by_engine[ $engine ] = $engine_mimes;
		}
	}

	// We need to find GLOBAL mime limits across all engines
	// So we'll be mashing all of the engine mime limits together
	$combined_ids = array();
	foreach ( $limits_by_engine as $engine_mime_ids ) {
		$id_list      = ( false !== strpos( $engine_mime_ids, ',' ) ) ? explode( ',', $engine_mime_ids ) : array( $engine_mime_ids );
		$combined_ids = array_merge( $combined_ids, $id_list );
	}

	$combined_ids = array_unique( array_map( 'absint', $combined_ids ) );

	// This query arg needs the actual mime type(s), not the IDs SearchWP uses in its settings
	return SWP()->get_mimes_from_settings_ids( $combined_ids );
}
/**
* Checks the stored in-process post IDs and existing index to ensure a rogue parallel indexer is not running
*
* @since 1.9
*/
function check_for_parallel_indexer() {
global $wpdb;
// only relevant when there is a chunk of posts waiting to be indexed
if ( is_array( $this->unindexedPosts ) && count( $this->unindexedPosts ) ) {
// prevent parallel indexers
// collect the IDs of the posts in this chunk
$ids_to_index = array();
foreach ( $this->unindexedPosts as $unindexed_post ) {
$ids_to_index[] = (int) $unindexed_post->ID;
}
// rewind the array pointer; index() walks this array with current()/next()
reset( $this->unindexedPosts );
// check what's in process *right now*
$in_process = searchwp_get_setting( 'in_process', 'stats' );
// keep only the in-process IDs that are also part of this chunk
if ( is_array( $in_process ) ) {
$in_process = array_intersect( $ids_to_index, $in_process );
}
// check the index too
// every ID is run through absint() before being placed in the IN() list
$ids_to_index = array_map( 'absint', $ids_to_index );
$ids_to_index_sql = implode( ',', $ids_to_index );
$index_table = $wpdb->prefix . SEARCHWP_DBPREFIX . 'index';
$ids_to_index_sql = "SELECT post_id FROM {$index_table} WHERE post_id IN ({$ids_to_index_sql}) GROUP BY post_id LIMIT 100";
$already_indexed = $wpdb->get_col( $ids_to_index_sql );
$already_indexed = array_map( 'absint', $already_indexed );
// if it's in the index, force the indexed flag
if ( is_array( $already_indexed ) && ! empty( $already_indexed ) ) {
foreach ( $already_indexed as $already_indexed_key => $already_indexed_id ) {
// a non-empty 'terms' meta value means this post still has queued terms to process
$remaining_terms = get_post_meta( (int) $already_indexed_id, '_' . SEARCHWP_PREFIX . 'terms', true );
if ( $remaining_terms ) {
if ( is_array( $remaining_terms ) ) {
$remaining_count = count( $remaining_terms );
} else {
$remaining_count = 'N/A';
}
do_action( 'searchwp_log', 'Post ' . (int) $already_indexed_id . ' is being chunked, ' . $remaining_count . ' remain');
// Reset the attempt count
$count = get_post_meta( (int) $already_indexed_id, '_' . SEARCHWP_PREFIX . 'attempts', true );
if ( false === $count ) {
$count = 0;
} else {
$count = intval( $count );
}
$count--;
// store the decremented attempt count (index() increments 'attempts' again for each post it processes)
update_post_meta( (int) $already_indexed_id, '_' . SEARCHWP_PREFIX . 'attempts', $count );
} else {
do_action( 'searchwp_log', 'Post ' . (int) $already_indexed_id . ' is already in the index' );
}
// if we're not dealing with a term queue, mark this post as indexed
if ( ! $remaining_terms ) {
update_post_meta( (int) $already_indexed_id, '_' . SEARCHWP_PREFIX . 'last_index', current_time( 'timestamp' ) );
} else {
// this is a term chunk update, not a conflict
unset( $already_indexed[ $already_indexed_key ] );
}
}
}
// combine the two results so we have one collection of conflicts
// (when the in-process setting was not an array only the already-indexed IDs are considered)
$conflicts = is_array( $in_process ) ? array_values( array_merge( (array) $in_process, (array) $already_indexed ) ) : (array) $already_indexed;
if ( ! empty( $conflicts ) ) {
do_action( 'searchwp_log', 'Parallel indexer detected when attempting to index: ' . implode( ', ', $conflicts ) );
// terminate the request on any conflict
die();
}
// no conflicts: record this chunk's IDs as being in process
searchwp_set_setting( 'in_process', $ids_to_index, 'stats' );
}
}
/**
* Extract PDF meta (PHP 5.3+)
*
* @deprecated in version 2.8
*
* @param $post_id
*
* @return array
*/
function extract_pdf_metadata( $post_id ) {
	// kept as a thin pass-through: the work is delegated to SearchWPDocumentParser
	$document_parser = new SearchWPDocumentParser( $post_id );
	$metadata        = $document_parser->extract_pdf_metadata( $post_id );

	return $metadata;
}
/**
* Extract plain text from PDF
*
* @deprecated in version 2.8
*
* @param $post_id
*
* @return string
*/
function extract_pdf_text( $post_id ) {
	// kept as a thin pass-through: the work is delegated to SearchWPDocumentParser
	$document_parser = new SearchWPDocumentParser( $post_id );
	$text            = $document_parser->extract_pdf_text( $post_id );

	return $text;
}
/**
* Index posts stored in $this->unindexedPosts
*
* @since 1.0
*/
function index() {
$this->check_for_parallel_indexer();
if ( is_array( $this->unindexedPosts ) && count( $this->unindexedPosts ) ) {
do_action( 'searchwp_indexer_pre_chunk', $this->unindexedPosts );
// all of the IDs to index have not been indexed, proceed with indexing them
while ( ( $unindexedPost = current( $this->unindexedPosts ) ) !== false ) {
$this->set_post( $unindexedPost );
// log the attempt
$count = get_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'attempts', true );
if ( false === $count ) {
$count = 0;
} else {
$count = intval( $count );
}
$count++;
// increment our counter to prevent the indexer getting stuck on a gigantic PDF
update_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'attempts', $count );
do_action( 'searchwp_log', 'Attempt ' . $count . ' at indexing ' . $this->post->ID );
// if we breached the maximum number of attempts, flag it to skip
$this->maxAttemptsToIndex = absint( apply_filters( 'searchwp_max_index_attempts', $this->maxAttemptsToIndex ) );
if ( intval( $count ) > $this->maxAttemptsToIndex ) {
do_action( 'searchwp_log', 'Too many indexing attempts on ' . $this->post->ID . ' (' . $this->maxAttemptsToIndex . ') - skipping' );
// flag it to be skipped
update_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'skip', true );
} else {
// check to see if we're running a second pass on terms
$termCache = false;
$term_cache_chunks = get_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'terms', false );
// the term cache is chunked in case of big data so put it back together
if ( is_array( $term_cache_chunks ) && ! empty( $term_cache_chunks ) ) {
$termCache = array();
foreach ( $term_cache_chunks as $term_cache_chunk ) {
$termCache = array_merge( $termCache, $term_cache_chunk );
}
}
if ( ! is_array( $termCache ) ) {
do_action( 'searchwp_index_post', $this->post );
// if it's an attachment, we want the permalink
$slug = 'attachment' === $this->post->post_type ? str_replace( get_bloginfo( 'wpurl' ), '', get_permalink( $this->post->ID ) ) : '';
// we allow users to override the extracted content from documents, if they have done so this flag is set
$skipDocProcessing = get_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'skip_doc_processing', true );
// The default for whether we process documents should be based on whether document
// content or PDF metadata has been enabled in any engine.
$omit_parsing_default = SWP()->is_used_meta_key( 'searchwp_content', $this->post ) || SWP()->is_used_meta_key( 'searchwp_pdf_metadata', $this->post );
$omitDocProcessing = apply_filters( 'searchwp_omit_document_processing', ! $omit_parsing_default );
// storage
$pdf_metadata = '';
if ( 'attachment' === $this->post->post_type && ! $skipDocProcessing && ! $omitDocProcessing ) {
$parser = new SearchWPDocumentParser( $this->post->ID );
// Check for existing document content in case this is an index rebuilt and the PDF
// parsing already happened, we can use that here instead and save the trouble
$document_content = get_post_meta( $this->post->ID, SEARCHWP_PREFIX . 'content', true );
if ( empty( $document_content ) ) {
$document_content = $parser->extract_document_content();
}
if ( false === $document_content ) {
// flag it for further review
update_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'review', true );
update_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'skip', true );
} else {
$document_content = trim( $document_content );
if ( ! empty( $document_content ) ) {
if ( function_exists( 'mb_convert_encoding' ) ) {
$is_utf8 = in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true );
if ( $is_utf8 ) {
$document_content = mb_convert_encoding( $document_content, 'UTF-8' );
}
}
$document_content = sanitize_text_field( $document_content );
delete_post_meta( $this->post->ID, SEARCHWP_PREFIX . 'content' );
update_post_meta( $this->post->ID, SEARCHWP_PREFIX . 'content', $document_content );
}
}
// if it's a PDF, document the PDF metadata
if ( 'application/pdf' === $this->post->post_mime_type ) {
$pdf_metadata = $parser->extract_pdf_metadata( $this->post->ID );
if ( false !== $pdf_metadata ) {
// allow developers to filter the metadata
$pdf_metadata = apply_filters( 'searchwp_pdf_metadata', $pdf_metadata, $this->post->ID );
// allow developers to store metadata as they wish
do_action( 'searchwp_index_pdf_metadata', $pdf_metadata, $this->post->ID );
delete_post_meta( $this->post->ID, SEARCHWP_PREFIX . 'pdf_metadata' );
update_post_meta( $this->post->ID, SEARCHWP_PREFIX . 'pdf_metadata', $pdf_metadata );
}
}
}
$postTerms = array();
$postTerms['title'] = $this->index_title();
$postTerms['slug'] = $this->index_slug( str_replace( '/', ' ', $slug ) );
$postTerms['content'] = $this->index_content();
$postTerms['excerpt'] = $this->index_excerpt();
if ( apply_filters( 'searchwp_index_comments', true ) ) {
$postTerms['comments'] = $this->index_comments();
}
// index taxonomies
$taxonomies = get_object_taxonomies( $this->post->post_type );
// let devs filter which taxonomies should be indexed for this post
$taxonomies = apply_filters( 'searchwp_indexer_taxonomies', $taxonomies, $this->post );
if ( ! empty( $taxonomies ) ) {
while ( ( $taxonomy = current( $taxonomies ) ) !== false ) {
// if there's no weight, it's meaningless
$used_taxonomy = SWP()->is_used_taxonomy( $taxonomy );
if ( $used_taxonomy ) {
$terms = get_the_terms( $this->post->ID, $taxonomy );
$terms = apply_filters( 'searchwp_indexer_taxonomy_terms', $terms, $taxonomy, $this->post );
if ( ! empty( $terms ) ) {
$postTerms['taxonomy'][ $taxonomy ] = $this->index_taxonomy_terms( $taxonomy, $terms );
}
}
next( $taxonomies );
}
reset( $taxonomies );
}
// index custom fields
$customFields = apply_filters( 'searchwp_get_custom_fields', $this->post->custom, $this->post->ID );
// if it was a PDF let's ensure that our content is in the list
if ( ! empty( $document_content ) && is_array( $customFields ) && ! array_key_exists( 'searchwp_content', $customFields ) ) {
$customFields['searchwp_content'] = $document_content;
}
if ( ! empty( $pdf_metadata ) ) {
$customFields['searchwp_pdf_metadata'] = $pdf_metadata;
}
// When SearchWP initially loads Custom Fields it uses get_post_custom() which forces everything
// into arrays, but in our case we have a couple of Custom Fields that we assume are strings.
// Due to backwards compatibility we can't globally munge these Custom Field values, but we
// are going to force our internal values to be what we expect.
if (
isset( $customFields['searchwp_content'] )
&& is_array( $customFields['searchwp_content'] )
&& count( $customFields['searchwp_content'] ) === 1
&& isset( $customFields['searchwp_content'][0] )
) {
$customFields['searchwp_content'] = $customFields['searchwp_content'][0];
}
if (
isset( $customFields['searchwp_pdf_metadata'] )
&& is_array( $customFields['searchwp_pdf_metadata'] )
&& count( $customFields['searchwp_pdf_metadata'] ) === 1
&& isset( $customFields['searchwp_pdf_metadata'][0] )
) {
$customFields['searchwp_pdf_metadata'] = $customFields['searchwp_pdf_metadata'][0];
}
// reset document content and text content to prevent it from being used on subsequent index calls for this chunk
$document_content = '';
/** @noinspection PhpUnusedLocalVariableInspection */
$pdf_metadata = '';
$excluded_meta_keys = searchwp_get_excluded_meta_keys();
if ( ! empty( $customFields ) ) {
while ( ( $customFieldValue = current( $customFields ) ) !== false ) {
$customFieldName = key( $customFields );
// allow developers to conditionally omit specific custom fields
$excluded_by_engine = ! SWP()->is_used_meta_key( $customFieldName, $this->post );
// Additional processing (e.g. oembeds have their own hashed meta key)
if ( empty( $excluded_by_engine ) && apply_filters( 'searchwp_indexer_additional_meta_exclusions', true ) ) {
$excluded_by_engine = 0 === strpos( $customFieldName, '_oembed_' );
}
$omit_this_custom_field = apply_filters( 'searchwp_omit_meta_key', $excluded_by_engine, $customFieldName, $this->post );
$omit_this_custom_field = apply_filters( "searchwp_omit_meta_key_{$customFieldName}", $omit_this_custom_field, $this->post );
if ( ! in_array( $customFieldName, $excluded_meta_keys, true ) && ! $omit_this_custom_field ) {
// allow devs to swap out their own content
// e.g. parsing ACF Relationship fields (that store only post IDs) to actually retrieve that content at runtime
$customFieldValue = apply_filters( 'searchwp_custom_fields', $customFieldValue, $customFieldName, $this->post );
$customFieldValue = apply_filters( "searchwp_custom_field_{$customFieldName}", $customFieldValue, $this->post );
$postTerms['customfield'][ $customFieldName ] = $this->index_custom_field( $customFieldName, $customFieldValue );
}
next( $customFields );
}
reset( $customFields );
}
// allow developer to store arbitrary information a la Custom Fields (without them actually being Custom Fields)
$extraMetadata = apply_filters( 'searchwp_extra_metadata', false, $this->post );
if ( $extraMetadata ) {
if ( is_array( $extraMetadata ) ) {
foreach ( $extraMetadata as $extraMetadataKey => $extraMetadataValue ) {
// TODO: make sure there are no collisions?
// while( isset( $postTerms['customfield'][$extraMetadataKey] ) ) {
// $extraMetadataKey .= '_';
// }
$postTerms['customfield'][ $extraMetadataKey ] = $this->index_custom_field( $extraMetadataKey, $extraMetadataValue );
// By default extra metadata lives only in the SearchWP index. As of SearchWP
// 3.1 quoted searches are supported, but only if the match exists in a database
// table. This hook makes it easy to opt-in to that by persisting the extra
// metadata to the postmeta table, but outside of the SearchWP engine config.
$persist_extra_metadata = apply_filters( 'searchwp_persist_extra_metadata', false );
if ( $persist_extra_metadata ) {
// Each extra metadata key needs its own record in the database because
// at the very least SearchWP doesn't want to globally perform
// quoted search matches against what may be a false positive.
update_post_meta(
$this->post->ID,
'_' . SEARCHWP_PREFIX . 'extra_metadata_' . $extraMetadataKey,
$extraMetadataValue
);
}
}
}
}
// we need to break out the terms from all of this content
$termCountBreakout = array();
if ( is_array( $postTerms ) && count( $postTerms ) ) {
foreach ( $postTerms as $type => $terms ) {
switch ( $type ) {
case 'title':
case 'slug':
case 'content':
case 'excerpt':
case 'comments':
if ( is_array( $terms ) && count( $terms ) ) {
foreach ( $terms as $term ) {
$term_id = '_' . md5( $term['term'] );
// make sure the array has a key for this term
if ( ! isset( $termCountBreakout[ $term_id ] ) ) {
$termCountBreakout[ $term_id ] = array(
'term' => $term['term'],
'counts' => array(),
);
}
// make sure the counts array for this term has a key for this type
if ( ! isset( $termCountBreakout[ $term_id ]['counts'][ $type ] ) ) {
$termCountBreakout[ $term_id ]['counts'][ $type ] = array();
}
// add the counts for this term for this type
$termCountBreakout[ $term_id ]['counts'][ $type ] = absint( $term['count'] );
}
}
break;
case 'taxonomy':
case 'customfield':
if ( is_array( $terms ) && count( $terms ) ) {
foreach ( $terms as $name => $nameTerms ) {
if ( is_array( $nameTerms ) && count( $nameTerms ) ) {
foreach ( $nameTerms as $nameTerm ) {
$term_id = '_' . md5( $nameTerm['term'] );
// make sure the array has a key for this term
if ( ! isset( $termCountBreakout[ $term_id ] ) ) {
$termCountBreakout[ $term_id ] = array(
'term' => $nameTerm['term'],
'counts' => array(),
);
}
// make sure the counts array for this term has a key for this type
if ( ! isset( $termCountBreakout[ $term_id ]['counts'][ $type ] ) ) {
$termCountBreakout[ $term_id ]['counts'][ $type ] = array();
}
// make sure the type key has an array for the name
if ( ! isset( $termCountBreakout[ $term_id ]['counts'][ $type ][ $name ] ) ) {
$termCountBreakout[ $term_id ]['counts'][ $type ][ $name ] = array();
}
// add the counts for this term for this type
$termCountBreakout[ $term_id ]['counts'][ $type ][ $name ] = absint( $nameTerm['count'] );
}
}
}
}
break;
}
}
}
} else {
$termCountBreakout = $termCache;
// if there was a term cache, this repeated processing doesn't count, so decrement it
delete_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'attempts' );
delete_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'skip' );
}
// unless the term chunk limit says otherwise, we're going to flag this as being OK to log as indexed
$flagAsIndexed = true;
// we now have a multidimensional array of terms with counts per type in $termCountBreakout
// if the term count is huge, we need to split up this process so as to avoid
// hitting upper PHP execution time limits (term insertion is heavy), so we'll chunk the array of terms
$termChunkMax = 500;
// try to set a better default based on php.ini's memory_limit
$memoryLimit = ini_get( 'memory_limit' );
if ( preg_match( '/^(\d+)(.)$/', $memoryLimit, $matches ) ) {
if ( 'M' === $matches[2] ) {
$termChunkMax = ( (int) $matches[1] ) * 7; // 7 terms per MB RAM
} else {
// memory was set in K...
$termChunkMax = 100;
}
}
$termChunkLimit = apply_filters( 'searchwp_process_term_limit', $termChunkMax );
if ( count( $termCountBreakout ) > $termChunkLimit ) {
$acceptableTermCountBreakout = array_slice( $termCountBreakout, 0, $termChunkLimit, true );
// if we haven't pulled all of the terms, we can't consider this post indexed...
if ( $termChunkLimit < count( $termCountBreakout ) - 1 ) {
$flagAsIndexed = false;
// save the term breakout so we don't have to do it again
$remainingTerms = array_slice( $termCountBreakout, $termChunkLimit, null, true );
// we could be dealing with big data (i.e. parsed document) so we need to chunk
// the array of remaining terms as well, else we hit limits in update_post_meta()
$remaining_terms_chunks = array_chunk( $remainingTerms, $termChunkLimit, true );
unset( $remainingTerms );
// clear out any existing cache
delete_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'terms' );
// add our chunks
foreach ( $remaining_terms_chunks as $key => $remaining_terms_chunk ) {
add_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'terms', $remaining_terms_chunk );
unset( $remaining_terms_chunks[ $key ] );
}
}
// set the acceptable breakout as the main breakout
$termCountBreakout = $acceptableTermCountBreakout;
unset( $acceptableTermCountBreakout );
}
// there's a chance that all of the terms were filtered out and if there
// is nothing to index this post would never be flagged to skip resulting
// in an endless indexer loop
if ( ! empty( $termCountBreakout ) ) {
$terms_recorded = $this->record_post_terms( $termCountBreakout );
unset( $termCountBreakout );
// flag the post as indexed
if ( $flagAsIndexed ) {
// clean up our stored term array if necessary
if ( $termCache ) {
delete_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'terms' );
}
// clean up the attempt counter
delete_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'attempts' );
delete_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'skip' );
// flag as indexed (if terms were successfully indexed)
if ( false !== $terms_recorded ) {
update_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'last_index', current_time( 'timestamp' ) );
}
}
} else {
// there were no terms so we need to skip this post by flagging it as indexed
delete_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'attempts' );
delete_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'skip' );
update_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'last_index', current_time( 'timestamp' ) );
}
}
next( $this->unindexedPosts );
}
reset( $this->unindexedPosts );
do_action( 'searchwp_indexer_post_chunk' );
}
}
/**
 * Insert an array of terms into the terms table and retrieve all term IDs from submitted terms
 *
 * Each record's term is bulk-inserted (INSERT IGNORE) together with its reversed form and
 * its stem, then the IDs of all submitted terms are read back and attached to the records.
 * If the INSERT still fails after the retries are exhausted the current post is flagged as
 * problematic and execution is halted with die().
 *
 * @since 1.0
 *
 * @param array $termsArray Term records; each record is expected to carry a 'term' key
 *
 * @return array The submitted records, with an 'id' key set on the '_' . md5( term ) entry of
 *               every term whose ID was found; an empty array for empty/non-array input
 */
function pre_process_terms( $termsArray = array() ) {
	global $wpdb;

	if ( ! is_array( $termsArray ) || empty( $termsArray ) ) {
		return array();
	}

	// get our database vars prepped
	$termsTable = $wpdb->prefix . SEARCHWP_DBPREFIX . 'terms';
	$stemmer    = new SearchWPStemmer();

	$terms       = array(); // individually prepared (quoted) terms for the SELECT ... IN() below
	$newTerms    = array(); // flat list of values for the bulk INSERT placeholders
	$newTermsSQL = array(); // one placeholder group per term
	$knownTerms  = array(); // submitted term => true, used to match returned rows in one pass

	foreach ( $termsArray as $counts ) {
		// a record without a term cannot be stored, skip it instead of inserting an empty term
		if ( ! is_array( $counts ) || ! isset( $counts['term'] ) ) {
			continue;
		}

		$termToAdd = (string) $counts['term'];

		// WordPress 4.2 added emoji support which caused problems for the array storage
		// of terms and their term counts since the terms themselves were array keys,
		// so the array keys were switched to an underscore-prefixed md5 value and the
		// term is stored within that record

		// generate the reverse (UTF-8 aware, one match per character)
		preg_match_all( '/./us', $termToAdd, $contentr );
		$characters = isset( $contentr[0] ) ? $contentr[0] : array();
		$revTerm    = join( '', array_reverse( $characters ) );

		// find the stem; if the term was stemmed via the filter use it, else generate our own
		$maybeStemmed = apply_filters( 'searchwp_custom_stemmer', $termToAdd );
		$stem         = ( $termToAdd === $maybeStemmed ) ? $stemmer->stem( $termToAdd ) : $maybeStemmed;

		// store the record
		$terms[]       = $wpdb->prepare( '%s', $termToAdd );
		$newTermsSQL[] = '(%s,%s,%s)';
		$newTerms[]    = $termToAdd;
		$newTerms[]    = $revTerm;
		$newTerms[]    = $stem;

		$knownTerms[ $termToAdd ] = true;
	}

	// nothing usable was submitted, so there is nothing to insert or look up
	if ( empty( $newTermsSQL ) ) {
		return $termsArray;
	}

	// insert all of the terms into the terms table so each gets an ID
	$attemptCount = 1;
	// one initial attempt plus the (filterable) number of retries
	$maxAttempts  = absint( apply_filters( 'searchwp_indexer_max_attempts', 4 ) ) + 1;

	$insert_sql    = $wpdb->prepare( "INSERT IGNORE INTO {$termsTable} (term,reverse,stem) VALUES " . implode( ',', $newTermsSQL ), $newTerms );
	$insert_result = $wpdb->query( $insert_sql );

	while ( ( is_wp_error( $insert_result ) || false === $insert_result ) && $attemptCount < $maxAttempts ) {
		// sometimes a deadlock can happen, wait three seconds then try again
		do_action( 'searchwp_log', 'INSERT Deadlock ' . $attemptCount . '/' . $maxAttempts );
		sleep( 3 );
		$attemptCount++;

		// try the insert again
		$insert_result = $wpdb->query( $insert_sql );
	}

	// deadlocking could be a red herring, there's a remote chance the database table
	// doesn't even exist, so we need to handle that
	if ( is_wp_error( $insert_result ) || false === $insert_result ) {
		do_action( 'searchwp_log', 'Post failed indexing, flagging ' . $this->post->ID );

		// this will call out this post as problematic in the WP admin
		update_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'attempts', absint( $this->maxAttemptsToIndex ) + 1 );
		update_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'skip', true );
		delete_post_meta( $this->post->ID, '_' . SEARCHWP_PREFIX . 'last_index' );

		die(); // this is only an issue if there was a catastrophic problem (e.g. database tables didn't exist)
	} elseif ( $attemptCount > 1 ) {
		do_action( 'searchwp_log', 'Recovered from Deadlock at ' . $attemptCount . '/' . $maxAttempts );
	}

	// retrieve IDs for all terms
	$terms_sql = "-- noinspection SqlDialectInspection
SELECT id, term FROM {$termsTable} WHERE term IN( " . implode( ',', $terms ) . ' )'; // already prepared earlier in this method
	$termIDs   = $wpdb->get_results( $terms_sql, 'OBJECT_K' );

	// match term IDs to original terms with counts; the lookup map replaces a full rescan
	// of $termsArray for every returned row
	if ( is_array( $termIDs ) ) {
		foreach ( $termIDs as $termIDMeta ) {
			if ( ! isset( $termIDMeta->term, $termIDMeta->id ) ) {
				continue;
			}

			// only exact (byte for byte) matches against a submitted term receive an ID here
			if ( ! isset( $knownTerms[ $termIDMeta->term ] ) ) {
				continue;
			}

			// append the term ID to the original $termsArray
			$term_id                      = '_' . md5( $termIDMeta->term );
			$termsArray[ $term_id ]['id'] = absint( $termIDMeta->id );
		}
	}

	return $termsArray;
}
/**
 * Insert terms with counts into the database
 *
 * Term IDs are resolved through pre_process_terms(), after which one bulk INSERT is issued
 * per destination table (index, custom fields, taxonomies) for the post in $this->post.
 *
 * @param array $termsArray The terms to insert; each record carries a 'term' key and a 'counts' key
 * @return bool False when there was nothing to record, when term pre-processing returned
 *              nothing, or when any of the bulk INSERTs failed; true otherwise
 * @since 1.0
 */
function record_post_terms( $termsArray = array() ) {
	global $wpdb;

	if ( ! is_array( $termsArray ) || empty( $termsArray ) ) {
		return false;
	}

	$success = true; // track whether or not the database inserts went okay

	// get our database vars prepped
	$termsTable = $wpdb->prefix . SEARCHWP_DBPREFIX . 'terms';

	// retrieve IDs for all terms
	$termsArray = $this->pre_process_terms( $termsArray );

	if ( empty( $termsArray ) ) {
		// something went quite wrong
		return false;
	}

	$postID = absint( $this->post->ID );

	// count keys in the same order as the columns of the index table INSERT below
	$standardTypes = array( 'content', 'title', 'comments', 'excerpt', 'slug' );

	// storage in prep for bulk INSERTs
	$indexTerms       = $indexTermsSQL       = array();
	$customFieldTerms = $customFieldTermsSQL = array();
	$taxonomyTerms    = $taxonomyTermsSQL    = array();

	foreach ( $termsArray as $term ) {
		if ( ! is_array( $term ) || ! isset( $term['term'] ) ) {
			continue;
		}

		$key = trim( (string) $term['term'] );

		// compare against the empty string: empty() would also discard the valid term '0'
		if ( '' === $key ) {
			continue;
		}

		// if an ID is somehow missing, grab it
		// TODO: determine if this is still (ever) an issue
		if ( ! isset( $term['id'] ) ) {
			/** @noinspection SqlDialectInspection */
			$term['id'] = $wpdb->get_var( $wpdb->prepare( 'SELECT id FROM ' . $termsTable . ' WHERE term = %s', $key ) );
		}

		$termID = isset( $term['id'] ) ? absint( $term['id'] ) : 0;

		// the counts for our standard fields
		$indexTermsSQL[] = '(%d,%d,%d,%d,%d,%d,%d)';
		$indexTerms[]    = $termID;
		foreach ( $standardTypes as $type ) {
			$indexTerms[] = isset( $term['counts'][ $type ] ) ? absint( $term['counts'][ $type ] ) : 0;
		}
		$indexTerms[] = $postID;

		// our custom field counts, one row per meta key
		if ( ! empty( $term['counts']['customfield'] ) && is_array( $term['counts']['customfield'] ) ) {
			foreach ( $term['counts']['customfield'] as $customField => $customFieldCount ) {
				$customFieldTermsSQL[] = '(%s,%d,%d,%d)';
				array_push( $customFieldTerms, $customField, $termID, absint( $customFieldCount ), $postID );
			}
		}

		// our taxonomy counts, one row per taxonomy
		if ( ! empty( $term['counts']['taxonomy'] ) && is_array( $term['counts']['taxonomy'] ) ) {
			foreach ( $term['counts']['taxonomy'] as $taxonomyName => $taxonomyCount ) {
				$taxonomyTermsSQL[] = '(%s,%d,%d,%d)';
				array_push( $taxonomyTerms, $taxonomyName, $termID, absint( $taxonomyCount ), $postID );
			}
		}
	}

	// table suffix, column list, placeholder groups, placeholder values
	$inserts = array(
		array( 'index', '(term,content,title,comment,excerpt,slug,post_id)', $indexTermsSQL, $indexTerms ),
		array( 'cf', '(metakey,term,count,post_id)', $customFieldTermsSQL, $customFieldTerms ),
		array( 'tax', '(taxonomy,term,count,post_id)', $taxonomyTermsSQL, $taxonomyTerms ),
	);

	foreach ( $inserts as $insert ) {
		list( $suffix, $columns, $groups, $values ) = $insert;

		if ( empty( $values ) ) {
			continue;
		}

		$table  = $wpdb->prefix . SEARCHWP_DBPREFIX . $suffix;
		$result = $wpdb->query(
			$wpdb->prepare( "INSERT INTO {$table} {$columns} VALUES " . implode( ',', $groups ), $values )
		);

		// $wpdb->query() returns false on error; report it instead of claiming success
		if ( false === $result ) {
			$success = false;
			do_action( 'searchwp_log', 'Failed to record ' . $suffix . ' terms for ' . $postID );
		}
	}

	return $success;
}
/**
 * Swap accented characters in the submitted string for their unaccented counterparts
 *
 * The conversion table and the converted result each pass through two filters: a
 * misspelled hook name that shipped in a release, followed by the correctly spelled one.
 *
 * @param string $string The string from which to remove accents
 * @return string The converted string, after both result filters have run
 * @since 1.0
 */
function remove_accents( $string ) {
	$unconverted = $string;

	$table = array(
'À' => 'a', 'Á' => 'a', 'Â' => 'a', 'Ã' => 'a', 'Ä' => 'a', 'Å' => 'a', 'Æ' => 'a', 'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a', 'å' => 'a', 'æ' => 'a',
'Ò' => 'o', 'Ó' => 'o', 'Ô' => 'o', 'Õ' => 'o', 'Ö' => 'o', 'Ø' => 'o', 'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ø' => 'o',
'È' => 'e', 'É' => 'e', 'Ê' => 'e', 'Ë' => 'e', 'è' => 'e', 'é' => 'e', 'ê' => 'e', 'ë' => 'e', 'ð' => 'e',
'Ç' => 'c', 'ç' => 'c',
'Ð' => 'd',
'Ì' => 'i', 'Í' => 'i', 'Î' => 'i', 'Ï' => 'i', 'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i',
'Ù' => 'u', 'Ú' => 'u', 'Û' => 'u', 'Ü' => 'u', 'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u',
'Ñ' => 'n', 'ñ' => 'n',
'Þ' => 't',
'ß' => 's',
'ÿ' => 'y', 'ý' => 'y',
// greek
'Ά' => 'Α', 'ά' => 'α', 'Έ' => 'Ε', 'έ' => 'ε', 'Ή' => 'Η', 'ή' => 'η', 'Ί' => 'Ι', 'ί' => 'ι', 'Ό' => 'Ο', 'ό' => 'ο', 'Ύ' => 'Υ', 'ύ' => 'υ', 'Ώ' => 'Ω', 'ώ' => 'ω', 'ϊ' => 'ι', 'ϋ' => 'υ', 'Ϊ' => 'ι', 'Ϋ' => 'Υ',
	);

	// legacy hook: this misspelling made it to release, so it has to keep firing
	$table = apply_filters( 'searchwp_leinent_accents_conversions', $table );

	// correctly spelled hook: let developers customize the conversion table
	$table = apply_filters( 'searchwp_lenient_accents_conversions', $table );

	$converted = strtr( $string, $table );

	// legacy (misspelled) result hook first, then the correctly spelled one that lets
	// developers 'fix' an incorrect conversion
	$converted = apply_filters( 'searchwp_leinent_accent_result', $converted, $unconverted );

	return apply_filters( 'searchwp_lenient_accent_result', $converted, $unconverted );
}
/**
 * Determine keyword weights for a given string. Our 'weights' are not traditional, but instead simple counts
 * so as to facilitate changing weights on the fly and not having to reindex. Actual weights are computed at
 * query time.
 *
 * The string is lowercased and split on single spaces; terms extracted by the pattern
 * whitelist are appended as-is. Terms longer than $this->max_term_length bytes are dropped
 * and, when $this->lenient_accents is enabled, an accent-less copy of each term is added.
 *
 * @param string $string The string from which to obtain weights
 * @return array Terms and their correlating counts (empty for empty or non-string input)
 * @since 1.0
 */
function get_term_counts( $string = '' ) {
	$searchwp  = SWP();
	$wordArray = array();

	if ( ! is_string( $string ) || empty( $string ) ) {
		return $wordArray;
	}

	// we need front and back spaces so we can perform exact matches when whitelisting
	$string = ' ' . $string . ' ';

	// extract terms based on whitelist pattern, allowing for approved indexing of terms with punctuation
	$whitelisted_terms = $searchwp->extract_terms_using_pattern_whitelist( $string );

	if ( ! empty( $whitelisted_terms ) && apply_filters( 'searchwp_exclusive_regex_matches', false ) ) {
		$string = SWP()->process_exclusive_regex_matches( $string, $whitelisted_terms );
	}

	$string_lowercase = function_exists( 'mb_strtolower' ) ? mb_strtolower( $string, 'UTF-8' ) : strtolower( $string );

	// explode() hands back a single-element array when there is no space to split on
	$exploded = explode( ' ', trim( $string_lowercase ) );

	// append our whitelist
	if ( is_array( $whitelisted_terms ) && ! empty( $whitelisted_terms ) ) {
		$whitelisted_terms = array_map( 'trim', $whitelisted_terms );
		$whitelisted_terms = array_filter( $whitelisted_terms, 'strlen' );

		// We need to remove what was whitelisted so as to not interfere with counts.
		$exploded = array_diff( $exploded, $whitelisted_terms );

		if ( ! empty( $whitelisted_terms ) ) {
			$exploded = array_merge( $exploded, $whitelisted_terms );
		}
	}

	if ( ! is_array( $exploded ) || empty( $exploded ) ) {
		return $wordArray;
	}

	// foreach walks a snapshot of the array, so the accent-less copies appended below are
	// not themselves revisited
	foreach ( $exploded as $term_key => $term_term ) {
		// every check below works on the trimmed term, the same value that gets stored
		$term_term             = trim( $term_term );
		$exploded[ $term_key ] = $term_term;

		// ensure word length obeys database schema
		if ( strlen( $term_term ) > $this->max_term_length ) {
			// just drop it, it's useless anyway
			unset( $exploded[ $term_key ] );
			continue;
		}

		// accommodate accent-less searches (e.g. allow accented search results with non-accented search terms)
		// this happens with WordPress taxonomy terms (WP strips them out)
		if ( $this->lenient_accents ) {
			$without_accent = $this->remove_accents( $term_term );
			$without_accent = function_exists( 'mb_strtolower' ) ? mb_strtolower( $without_accent, 'UTF-8' ) : strtolower( $without_accent );

			if ( $without_accent !== $term_term ) {
				// "duplicate" the term with this accent-less version
				$exploded[] = $without_accent;
			}
		}
	}

	return $this->get_word_count_from_array( array_values( $exploded ) );
}
/**
 * Determine a word count for the submitted array.
 *
 * Terms found in $this->common and terms shorter (in bytes) than the filterable minimum
 * word length are left out of the result.
 *
 * Modified version of Sphider's unique_array() by Ando Saabas, http://www.sphider.eu/
 *
 * @param array $arr The terms to count
 * @return array List of array( 'term' => string, 'count' => int ) records, one per distinct term
 * @since 1.0
 */
function get_word_count_from_array( $arr = array() ) {
	$newarr = array();

	// set the minimum character length to count as a valid term (resolved once, not per term)
	$minLength = absint( apply_filters( 'searchwp_minimum_word_length', 3 ) );

	if ( ! is_array( $arr ) ) {
		return $newarr;
	}

	foreach ( $arr as $term ) {
		// only strings and numbers can be terms; anything else is skipped rather than
		// cutting the loop short or being handed to strlen()/md5()
		if ( ! is_string( $term ) && ! is_numeric( $term ) ) {
			continue;
		}

		// cheap length check first, common word lookup second
		if ( strlen( $term ) < $minLength || in_array( $term, $this->common, true ) ) {
			continue;
		}

		$key = md5( $term );

		if ( isset( $newarr[ $key ] ) ) {
			$newarr[ $key ]['count'] = absint( $newarr[ $key ]['count'] ) + 1;
			continue;
		}

		$newarr[ $key ] = array(
			'term'  => sanitize_text_field( $term ),
			'count' => 1,
		);
	}

	// the md5 keys are only needed while counting
	return array_values( $newarr );
}
/**
 * Retrieve only the term content from the submitted string
 *
 * Entities are decoded, the values of selected HTML attributes are appended, markup and
 * punctuation are removed and, unless extra processing is skipped, the content is
 * lowercased and the terms matched by the pattern whitelist are appended.
 *
 * Modified from Sphider by Ando Saabas, http://www.sphider.eu/
 *
 * @param string|array|object $content               The source content, can include markup; arrays and
 *                                                   objects go through parse_variable_for_terms() first
 * @param bool                $skip_extra_processing When true the exclusive whitelist match removal,
 *                                                   the lowercasing and the whitelist append are skipped
 * @return string The content without markup or character encoding
 * @since 1.0
 */
function clean_content( $content = '', $skip_extra_processing = false ) {
	$searchwp = SWP();

	if ( is_array( $content ) || is_object( $content ) ) {
		$content = $this->parse_variable_for_terms( $content );
	}

	$content = html_entity_decode( $content );

	// allow developers the ability to customize content where necessary (e.g. remove TM symbols)
	$content = apply_filters( 'searchwp_indexer_pre_process_content', $content );

	if ( function_exists( 'mb_convert_encoding' ) ) {
		$content = mb_convert_encoding( $content, 'UTF-8', 'UTF-8' );
	}

	$index_emoji = apply_filters( 'searchwp_index_emoji', false );
	if ( empty( $searchwp->settings['utf8mb4'] ) || empty( $index_emoji ) ) {
		$content = $searchwp->replace_4_byte( $content );
	}

	// we want to extract potentially valuable content from certain HTML attributes
	$accepted_attributes = apply_filters( 'searchwp_indexer_tag_attributes', array(
		'a'     => array( 'title' ),
		'img'   => array( 'alt', 'src', 'longdesc', 'title' ),
		'input' => array( 'placeholder', 'value' ),
	) );

	// Handle strange entities that are better suited by not strange entities.
	$content = preg_replace( '~\x{00AD}~u', '-', $content ); // soft hyphen (U+00AD) => hyphen.

	// parse $content as a DOMDocument and if applicable extract the accepted attribute content
	$attribute_content = array();
	$content           = trim( $content );

	if ( ! empty( $accepted_attributes )
		&& ! empty( $content )
		&& is_array( $accepted_attributes )
		&& class_exists( 'DOMDocument' )
		&& function_exists( 'libxml_use_internal_errors' )
	) {
		$dom = new DOMDocument();

		// collect parse errors internally instead of emitting warnings, remembering the
		// previous setting so it can be put back once we are done
		$previous_libxml_setting = libxml_use_internal_errors( true );

		// without an encoding hint loadHTML() does not treat the markup as UTF-8 and multibyte
		// attribute values come back garbled; the content is UTF-8 at this point
		$dom->loadHTML( '<?xml encoding="UTF-8">' . $content );

		// loop through our accepted tags
		foreach ( $accepted_attributes as $tag => $attributes ) {
			// grab any $tag matches
			$node_list = $dom->getElementsByTagName( $tag );

			for ( $i = 0; $i < $node_list->length; $i++ ) {
				$node = $node_list->item( $i );

				if ( ! $node->hasAttributes() ) {
					continue;
				}

				foreach ( $node->attributes as $attr ) {
					if ( isset( $attr->name ) && in_array( $attr->name, $attributes, true ) ) {
						$attribute_content[] = sanitize_text_field( $attr->nodeValue );
					}
				}
			}
		}

		// the collected errors are never read; drop them so the buffer does not keep growing
		// from one call to the next, then restore the caller's libxml error handling
		libxml_clear_errors();
		libxml_use_internal_errors( $previous_libxml_setting );
	}

	// append the attribute content to our main content block
	if ( ! empty( $attribute_content ) ) {
		$content .= ' ' . implode( ' ', $attribute_content );
	}

	// we need front and back spaces so we can perform exact matches when whitelisting
	$content = ' ' . $content . ' ';

	// extract terms based on whitelist pattern, allowing for approved indexing of terms with punctuation
	$whitelisted_terms = $searchwp->extract_terms_using_pattern_whitelist( $content );

	// when indexing we do not want to remove the matches; we're going to run everything through
	// the regular sanitization so as to open the possibility for better partial matching (especially
	// when taking into consideration the use of LIKE Terms or another extension)
	// there may be times however, that the developer does in fact want matches to be exclusively kept together
	if ( ! $skip_extra_processing && apply_filters( 'searchwp_exclusive_regex_matches', false ) && ! empty( $whitelisted_terms ) ) {
		// add the buffer the entire string so we can whole-word replace
		$content = ' ' . $content . ' ';

		// also need to buffer the whitelisted terms to prevent replacement overrun
		foreach ( $whitelisted_terms as $key => $val ) {
			$whitelisted_terms[ $key ] = ' ' . $val . ' ';
		}

		// remove the matches
		$content = str_ireplace( $whitelisted_terms, ' ', $content );

		// remove the term buffer
		$whitelisted_terms = array_map( 'trim', $whitelisted_terms );

		// collapse any run of spaces left behind by the buffering and the removal
		$content = preg_replace( '/ {2,}/', ' ', $content );
	}

	// buffer tags with spaces before removing them
	$content = preg_replace( '/<[^>]*>/', ' \\0 ', $content );

	// non-breaking spaces (U+00A0, as UTF-8 bytes) separate terms just like regular spaces
	$content = str_replace( "\xC2\xA0", ' ', $content );

	if ( ! $skip_extra_processing ) {
		$content = function_exists( 'mb_strtolower' ) ? mb_strtolower( $content, 'UTF-8' ) : strtolower( $content );
	}

	// <br> tags can be problematic on their own if there's no whitespace surrounding
	// what should be separate lines of text, so we'll manually do that prior to stripping
	$content = str_replace( array( '<br />', '<br/>', '<br>' ), ' ', $content );

	// since we've extracted and appended the attribute content we can strip the tags entirely
	$content = strip_tags( $content );
	$content = stripslashes( $content );

	// remove punctuation
	$punctuation = array( '(', ')', '·', "'", '´', '’', '‘', '”', '“', '„', '—', '–', '×', '…', '€', '\n', '.', ',', '/', '\\', '|', '[', ']', '{', '}', '•', '`' );
	$content     = str_replace( $punctuation, ' ', $content );
	$content     = preg_replace( '/[[:punct:]]/uiU', ' ', $content );
	$content     = preg_replace( '/[[:space:]]/uiU', ' ', $content );
	$content     = preg_replace( '/\\n|\\R/uiU', ' ', $content );

	// append our whitelist
	if ( ! $skip_extra_processing && is_array( $whitelisted_terms ) && ! empty( $whitelisted_terms ) ) {
		$whitelisted_terms = array_map( 'trim', $whitelisted_terms );
		$whitelisted_terms = array_filter( $whitelisted_terms, 'strlen' );
		$content          .= ' ' . implode( ' ', $whitelisted_terms );
	}

	$content = sanitize_text_field( $content );
	$content = trim( $content );

	return $content;
}
/**
 * Get the term counts for a title
 *
 * Falls back to the title of the post currently being indexed when no usable
 * (non-empty string) title is submitted.
 *
 * @param string $title The title to index
 * @return array|bool|string Terms and their associated counts, false when there is nothing
 *                           to index after cleaning, an empty string when the title
 *                           attribute is not used
 * @since 1.0
 */
function index_title( $title = '' ) {
	// nothing to do when the title attribute is not in use
	if ( ! $this->is_attribute_used( 'title' ) ) {
		return '';
	}

	$submitted_title_usable = is_string( $title ) && ! empty( $title );

	if ( ! $submitted_title_usable && ! empty( $this->post->post_title ) ) {
		$title = $this->post->post_title;
	}

	$cleaned_title = $this->clean_content( $title );

	if ( empty( $cleaned_title ) || ! is_string( $cleaned_title ) ) {
		return false;
	}

	return $this->get_term_counts( $cleaned_title );
}
/**
 * Determine whether a native attribute is used for the post type of the current post
 *
 * An attribute counts as used when at least one engine has the post type enabled with a
 * non-empty weight for that attribute. The verdict passes through the
 * 'searchwp_is_attribute_used' filter; for attributes other than title, slug, content and
 * excerpt the filter result (defaulting to false) is returned as-is.
 *
 * @param string $attribute The attribute to check
 * @return bool|mixed Whether the attribute is used
 */
public function is_attribute_used( $attribute = '' ) {
	$used = false;

	// attributes we do not know about are left entirely to the filter
	if ( ! in_array( $attribute, array( 'title', 'slug', 'content', 'excerpt' ), true ) ) {
		return apply_filters( 'searchwp_is_attribute_used', $used, $attribute );
	}

	foreach ( SWP()->settings['engines'] as $engine => $post_types ) {
		foreach ( $post_types as $post_type => $post_type_settings ) {
			$weighted_here = $post_type === $this->post->post_type
				&& ! empty( $post_type_settings['enabled'] )
				&& ! empty( $post_type_settings['weights'] )
				&& ! empty( $post_type_settings['weights'][ $attribute ] );

			if ( $weighted_here ) {
				$used = true;
			}
		}

		// one engine using the attribute is enough
		if ( $used ) {
			break;
		}
	}

	return (bool) apply_filters( 'searchwp_is_attribute_used', $used, $attribute );
}
/**
* Index the filename itself
*
* @param string $filename The filename to index
* @return array|bool
*/
function index_filename( $filename = '' ) {
    // Count the terms of a file's name, ignoring its directory and extension.
    // Returns false when there is no usable name.

    // basename()/pathinfo() only make sense for scalar input.
    $filename = is_scalar( $filename ) ? (string) $filename : '';

    // PATHINFO_FILENAME strips the directory and only the final extension, so
    // "annual.report.2020.pdf" keeps "annual.report.2020" rather than being cut
    // at the first dot.
    $name = pathinfo( $filename, PATHINFO_FILENAME );

    // Turn the remaining dots into spaces so every segment (including the first,
    // which is what used to be indexed) is available as its own word.
    // NOTE(review): assumes get_term_counts() splits on whitespace - confirm.
    $name = trim( str_replace( '.', ' ', $name ) );

    if ( empty( $name ) ) {
        return false;
    }

    return $this->get_term_counts( $name );
}
/**
* Break a filename (minus directory and extension) into space-separated keywords
*
* @param string $filename The filename to index
* @return string The keywords found in the filename, separated by single spaces
* @since 1.0
* @deprecated 1.5.1
*/
function extract_filename_terms( $filename = '' ) {
    // Split a filename into keywords on '-' and '_' and return them joined by spaces.
    //
    // The previous implementation appended to the very array it was walking with
    // current()/next(), so the loop never finished; it also split on '-' a second
    // time instead of on '_'.
    if ( ! is_string( $filename ) || '' === $filename ) {
        return '';
    }

    // Everything after the first dot is treated as extension and ignored.
    $segments = explode( '.', basename( $filename ) );
    $name     = $segments[0];

    // First split on hyphens, then split each of those pieces on underscores.
    $pieces = array();
    foreach ( explode( '-', $name ) as $hyphen_piece ) {
        foreach ( explode( '_', $hyphen_piece ) as $piece ) {
            // Consecutive separators produce empty pieces; drop them.
            if ( '' !== $piece ) {
                $pieces[] = $piece;
            }
        }
    }

    // If we found some pieces put them back together, if not use the original.
    return empty( $pieces ) ? $filename : implode( ' ', $pieces );
}
/**
* Get the term counts for a slug
*
* @param string $slug The slug to index
* @return array|bool|string Terms and their associated counts; false when there is nothing to index; '' when the slug attribute is not in use
* @since 1.0
*/
function index_slug( $slug = '' ) {
    // Slugs are only indexed when is_attribute_used() says the attribute is in use.
    if ( ! $this->is_attribute_used( 'slug' ) ) {
        return '';
    }

    // No usable slug passed in: use the post_name of the post being indexed, if any.
    if ( ( ! is_string( $slug ) || empty( $slug ) ) && ! empty( $this->post->post_name ) ) {
        $slug = $this->post->post_name;
    }

    // Hyphens separate the words of a slug, so turn them into spaces before cleaning.
    $slug = $this->clean_content( str_replace( '-', ' ', $slug ) );

    $indexable = is_string( $slug ) && ! empty( $slug );

    return $indexable ? $this->get_term_counts( $slug ) : false;
}
/**
* Get the term counts for a content block
*
* @param string $content The content to index
* @return array|bool|string Terms and their associated counts; false when there is nothing to index; '' when the content attribute is not in use
* @since 1.0
*/
function index_content( $content = '' ) {
    // Skip the work entirely when the content attribute is not in use.
    if ( ! $this->is_attribute_used( 'content' ) ) {
        return '';
    }

    // Prefer what the caller passed; otherwise fall back to the post's own content.
    $caller_gave_content = is_string( $content ) && ! empty( $content );
    if ( ! $caller_gave_content && ! empty( $this->post->post_content ) ) {
        $content = $this->post->post_content;
    }

    $cleaned = $this->clean_content( $content );

    if ( empty( $cleaned ) || ! is_string( $cleaned ) ) {
        return false;
    }

    // Content can be long, so it goes through the chunk-aware counter.
    return $this->get_big_data_term_count( $cleaned );
}
/**
* Get the term counts for a comment
*
* @return array Terms and their associated counts
* @since 1.0
*/
function index_comments() {
    // Collect the text of every approved comment on the current post (optionally
    // with author name and email) and return its term counts.
    // TODO: short circuit on pingback/trackback?
    $comments_args = array(
        'status'  => 'approve',
        'post_id' => $this->post->ID,
    );

    do_action( 'searchwp_indexer_pre_get_comments' );
    $comments = get_comments( apply_filters( 'searchwp_indexer_comments_args', $comments_args ) );

    $commentTerms = array();

    // The query args are filterable, so the result is not guaranteed to be a list
    // of comments (presumably a count when 'count' is requested - verify against
    // get_comments()); only iterate real arrays.
    if ( is_array( $comments ) ) {
        foreach ( $comments as $comment ) {
            $comment = apply_filters( 'searchwp_indexer_comment', $comment );

            if ( is_object( $comment ) ) {
                $author  = ( ! empty( $comment->comment_author ) && is_string( $comment->comment_author ) ) ? $comment->comment_author : '';
                $email   = ( ! empty( $comment->comment_author_email ) && is_string( $comment->comment_author_email ) ) ? $comment->comment_author_email : '';
                // An empty comment body contributes nothing; never hand the whole
                // comment object (author, email, ...) to the content cleaner.
                $content = ! empty( $comment->comment_content ) ? $comment->comment_content : '';
            } else {
                // A filter replaced the comment object with something else
                // (e.g. plain text); treat that value as the content.
                $author  = '';
                $email   = '';
                $content = $comment;
            }

            $content        = $this->clean_content( $content );
            $commentTerms[] = ( ! empty( $content ) && is_string( $content ) ) ? $content : '';

            // Author name and email are opt-in via their filters.
            if ( apply_filters( 'searchwp_include_comment_author', false ) ) {
                $commentTerms[] = sanitize_text_field( $author );
            }
            if ( apply_filters( 'searchwp_include_comment_email', false ) ) {
                $commentTerms[] = sanitize_text_field( $email );
            }
        }
    }

    return $this->get_big_data_term_count( implode( ' ', $commentTerms ) );
}
/**
* Index the terms within a taxonomy
*
* @param null|string $taxonomy The taxonomy name
* @param array $terms The terms to index
* @return array|bool Terms and their associated counts
* @since 1.0
*/
function index_taxonomy_terms( $taxonomy = null, $terms = array() ) {
    // Build one string from the names (and optionally slugs) of the given terms
    // and return its term counts, or false when there is nothing to index.
    $cleanTerms = array();

    if ( is_array( $terms ) ) {
        foreach ( $terms as $term ) {
            // Skip anything that is not a term object with a name.
            if ( ! is_object( $term ) || ! isset( $term->name ) ) {
                continue;
            }

            // e.g. an ampersand in a name arrives as the entity &amp; (it will be sanitized later)
            $term_string_to_index = html_entity_decode( $term->name );

            $context = array(
                'SWP'      => $this,
                'taxonomy' => $taxonomy,
                'term'     => $term,
            );

            // Indexing the slug alongside the name is opt-in.
            if ( apply_filters( 'searchwp_indexer_taxonomy_term_index_slug', false, $context ) && isset( $term->slug ) ) {
                $term_string_to_index .= ' ' . $term->slug;
            }

            $term_string_to_index = apply_filters( 'searchwp_indexer_taxonomy_term', $term_string_to_index, $context );
            $cleanTerms[]         = $this->clean_content( $term_string_to_index );
        }
    }

    $cleanTerms = trim( implode( ' ', $cleanTerms ) );

    // Both a non-empty term string and a valid taxonomy name are required.
    if ( empty( $cleanTerms ) || ! is_string( $cleanTerms ) || empty( $taxonomy ) || ! is_string( $taxonomy ) ) {
        return false;
    }

    return $this->get_term_counts( $cleanTerms );
}
/**
* Get the term counts for an excerpt
*
* @param string $excerpt The excerpt to index
* @return array|bool|string Terms and their associated counts; false when there is nothing to index; '' when the excerpt attribute is not in use
* @since 1.0
*/
function index_excerpt( $excerpt = '' ) {
    // An excerpt is only indexed when is_attribute_used() reports it in use.
    if ( ! $this->is_attribute_used( 'excerpt' ) ) {
        return '';
    }

    // Use the post's own excerpt when the caller did not provide a usable one.
    $excerpt_missing = ! is_string( $excerpt ) || empty( $excerpt );
    if ( $excerpt_missing && ! empty( $this->post->post_excerpt ) ) {
        $excerpt = $this->post->post_excerpt;
    }

    $excerpt = $this->clean_content( $excerpt );

    if ( is_string( $excerpt ) && ! empty( $excerpt ) ) {
        return $this->get_term_counts( $excerpt );
    }

    return false;
}
/**
* Extract term counts from potentially big data
*
* @since 2.8
*
* @param $string
*
* @return array
*/
function get_big_data_term_count( $string ) {
    // Count terms in a string of any size. Strings longer than
    // $this->big_data_trigger bytes are counted chunk by chunk and the per-chunk
    // counts are merged; shorter strings are counted in one pass. Always returns
    // a numerically indexed array.

    // Small enough: a single pass will do.
    if ( strlen( $string ) <= $this->big_data_trigger ) {
        $counts = $this->get_term_counts( $string );

        return is_array( $counts ) ? array_values( $counts ) : array();
    }

    $counts = array();

    // wordwrap() breaks at word boundaries, so no term is split across chunks.
    $parts       = explode( "\n", wordwrap( $string, $this->big_data_trigger ) );
    $total_parts = count( $parts );

    for ( $i = 0; $i < $total_parts; $i++ ) {
        $part_term_counts = $this->get_term_counts( $parts[ $i ] );

        // The chunk has been counted; release it right away.
        unset( $parts[ $i ] );

        if ( ! is_array( $part_term_counts ) ) {
            continue;
        }

        foreach ( $part_term_counts as $part_term_count ) {
            if ( ! isset( $part_term_count['term'], $part_term_count['count'] ) ) {
                continue;
            }

            // Every entry is stored under the hash of its term. Storing new terms
            // under a numeric key (as before) made the lookup below miss them in
            // later chunks, producing duplicate entries instead of a summed count.
            $term_hash = md5( $part_term_count['term'] );

            if ( isset( $counts[ $term_hash ] ) ) {
                $counts[ $term_hash ]['count'] += $part_term_count['count'];
            } else {
                $counts[ $term_hash ] = $part_term_count;
            }
        }
    }

    return array_values( $counts );
}
/**
* Index a Custom Field, no matter what format
*
* @param null $customFieldName Custom Field meta key
* @param mixed $customFieldValue Custom field value
* @return array|bool Terms and their associated counts
* @since 1.0
*/
function index_custom_field( $customFieldName = null, $customFieldValue = null ) {
    // Count the terms of a custom field value, whatever its format.
    //
    // $customFieldValue now has a default: a required parameter after an optional
    // one is deprecated as of PHP 8.0. Existing two-argument calls are unaffected.

    // A field without a usable name is never indexed.
    if ( empty( $customFieldName ) || ! is_string( $customFieldName ) ) {
        return false;
    }

    // Custom fields can be pretty much anything, so unserialize, json_decode, etc.
    $customFieldValue = $this->parse_variable_for_terms( $customFieldValue );

    if ( empty( $customFieldValue ) || ! is_string( $customFieldValue ) ) {
        return false;
    }

    return $this->get_big_data_term_count( $customFieldValue );
}
/**
* Retrieve terms from any kind of variable, even serialized and json_encode()ed values
*
* Modified from pods_sanitize() written by Scott Clark for Pods http://pods.io
*
* @param mixed $input Variable from which to obtain terms
* @return string Term list
* @since 1.0
*/
function parse_variable_for_terms( $input ) {
    // Flatten any value (string, JSON, serialized data, array, object, scalar)
    // into one string of terms. Booleans and null yield an empty string.
    $output = '';

    // Strings may carry encoded structures; decode them first.
    if ( is_string( $input ) ) {
        $json_decoded_input = json_decode( $input, true );

        if ( is_array( $json_decoded_input ) || is_string( $json_decoded_input ) ) {
            $input = $json_decoded_input;
        } elseif ( null === $json_decoded_input ) {
            // Not JSON; it may be PHP-serialized instead.
            // NOTE(review): maybe_unserialize() presumably ends in unserialize(),
            // which is unsafe on untrusted data - confirm where values originate.
            $input = maybe_unserialize( $input );
        }
        // Otherwise the string is a bare JSON number or literal (true/false):
        // keep the raw text so words such as "true" are indexed, not discarded.
    }

    if ( is_string( $input ) ) {
        $output = $this->clean_content( $input );
    } elseif ( is_array( $input ) || 'object' === gettype( $input ) ) {
        // gettype() is used rather than is_object() so that a
        // __PHP_Incomplete_Class (a serialized object whose class is not loaded)
        // is walked like any other object. The old check inspected the string
        // returned by the recursive call, so it could never match.
        $is_incomplete_class = ( $input instanceof __PHP_Incomplete_Class );

        foreach ( (array) $input as $key => $val ) {
            // Name of the missing class, not content.
            if ( $is_incomplete_class && '__PHP_Incomplete_Class_Name' === $key ) {
                continue;
            }

            $output .= ' ' . $this->parse_variable_for_terms( $val );
        }
    } elseif ( ! is_bool( $input ) ) {
        // Make it a string
        $output = (string) $input;
    }

    return $output;
}
// @codingStandardsIgnoreStart
/**
* @deprecated as of 2.5.7
*/
function updateRunningCounts() {
// Deprecated camelCase alias: forwards to update_running_counts() and returns nothing.
$this->update_running_counts();
}
/**
* @deprecated as of 2.5.7
*
* @param $post
*/
function setPost( $post ) {
// Deprecated camelCase alias: forwards $post to set_post() and returns nothing.
$this->set_post( $post );
}
/**
* @deprecated as of 2.5.7
*/
function countTotalPosts() {
// Deprecated camelCase alias: returns whatever count_total_posts() returns.
return $this->count_total_posts();
}
/**
* @deprecated as of 2.5.7
*/
function indexedCount() {
// Deprecated camelCase alias: returns whatever indexed_count() returns.
return $this->indexed_count();
}
/**
* @deprecated as of 2.5.7
*/
function findUnindexedPosts() {
// Deprecated camelCase alias: returns whatever find_unindexed_posts() returns.
return $this->find_unindexed_posts();
}
/**
* @deprecated as of 2.5.7
*
* @param $terms
*
* @return array
*/
function preProcessTerms( $terms ) {
// Deprecated camelCase alias: passes $terms to pre_process_terms() and returns its result.
return $this->pre_process_terms( $terms );
}
/**
* @deprecated as of 2.5.7
*
* @param $terms
*
* @return array
*/
function recordPostTerms( $terms ) {
// Deprecated camelCase alias: passes $terms to record_post_terms() and returns its result.
return $this->record_post_terms( $terms );
}
/**
* @deprecated as of 2.5.7
*
* @param string $string
*
* @return array
*/
function getTermCounts( $string = '' ) {
// Deprecated camelCase alias: passes $string to get_term_counts() and returns its result.
return $this->get_term_counts( $string );
}
/**
* @deprecated as of 2.5.7
*
* @param $array
*
* @return array
* @internal param string $string
*
*/
function getWordCountFromArray( $array ) {
// Deprecated camelCase alias: passes $array to get_word_count_from_array() and returns its result.
return $this->get_word_count_from_array( $array );
}
/**
* @deprecated as of 2.5.7
*
* @param $content
*
* @return string The cleaned content, as returned by clean_content()
* @internal param string $string
*
*/
function cleanContent( $content ) {
// Deprecated camelCase alias: passes $content to clean_content() and returns its result.
return $this->clean_content( $content );
}
/**
* @deprecated as of 2.5.7
*
* @param string $title
*
* @return array
* @internal param string $string
*
*/
function indexTitle( $title = '' ) {
// Deprecated camelCase alias: passes $title to index_title() and returns its result.
return $this->index_title( $title );
}
/**
* @deprecated as of 2.5.7
*
* @param string $filename
*
* @return array
* @internal param string $string
*
*/
function indexFilename( $filename = '' ) {
// Deprecated camelCase alias: passes $filename to index_filename() and returns its result.
return $this->index_filename( $filename );
}
/**
* @deprecated as of 2.5.7
*
* @param string $filename
*
* @return string Space-separated keywords, as returned by extract_filename_terms()
* @internal param string $string
*
*/
function extractFilenameTerms( $filename = '' ) {
// Deprecated camelCase alias for extract_filename_terms(), which carries its own @deprecated tag
// (hence the inspection suppression below).
/** @noinspection PhpDeprecationInspection */
return $this->extract_filename_terms( $filename );
}
/**
* @deprecated as of 2.5.7
*
* @param string $slug
*
* @return array
* @internal param string $filename
*
* @internal param string $string
*/
function indexSlug( $slug = '' ) {
// Deprecated camelCase alias: passes $slug to index_slug() and returns its result.
return $this->index_slug( $slug );
}
/**
* @deprecated as of 2.5.7
*
* @param $content
*
* @return array
* @internal param string $string
*
*/
function indexContent( $content ) {
// Deprecated camelCase alias: passes $content to index_content() and returns its result.
// Unlike index_content(), $content has no default here and must be supplied.
return $this->index_content( $content );
}
/**
* @deprecated as of 2.5.7
*/
function indexComments() {
// Deprecated camelCase alias: returns whatever index_comments() returns.
return $this->index_comments();
}
/**
* @deprecated as of 2.5.7
*
* @param null $taxonomy
* @param array $terms
*
* @return array|bool
*/
function indexTaxonomyTerms( $taxonomy = null, $terms = array() ) {
// Deprecated camelCase alias: passes both arguments to index_taxonomy_terms() and returns its result.
return $this->index_taxonomy_terms( $taxonomy, $terms );
}
/**
* @deprecated as of 2.5.7
*
* @param string $excerpt
*
* @return array|bool
*/
function indexExcerpt( $excerpt = '' ) {
// Deprecated camelCase alias: passes $excerpt to index_excerpt() and returns its result.
return $this->index_excerpt( $excerpt );
}
/**
* @deprecated as of 2.5.7
*
* @param null $name
* @param $value
*
* @return array|bool
*/
function indexCustomField( $name = null, $value = null ) {
    // Deprecated camelCase alias: passes both arguments to index_custom_field()
    // and returns its result.
    // $value now has a default because a required parameter after an optional
    // one is deprecated as of PHP 8.0; existing two-argument calls are unaffected.
    return $this->index_custom_field( $name, $value );
}
/**
* @deprecated as of 2.5.7
*
* @param $var
*
* @return array|bool
* @internal param null $name
* @internal param $value
*
*/
function parseVariableForTerms( $var ) {
// Deprecated camelCase alias: passes $var to parse_variable_for_terms() and returns its result.
return $this->parse_variable_for_terms( $var );
}
// @codingStandardsIgnoreEnd
}