Skip to content

Instantly share code, notes, and snippets.

@aristus
Last active August 29, 2015 13:57
Show Gist options
  • Save aristus/9688925 to your computer and use it in GitHub Desktop.
Save aristus/9688925 to your computer and use it in GitHub Desktop.
--- ext/standard/string.c (revision 301280)
+++ ext/standard/string.c (working copy)
@@ -1674,6 +1674,105 @@
}
/* }}} */
+#define FASTSEARCH_MIN 50
+
+/*
+ * Implementation of "fastsearch" algorithm devised by
+ * Fredrik Lundh: http://effbot.org/zone/stringlib.htm
+ * Adapted from Python-2.7/Objects/stringlib/fastsearch.h
+ *
+ * Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python
+ * Software Foundation; All Rights Reserved.
+ * Released under the Python Software Foundation License v2
+ */
+
+/* Poor man's Bloom filter */
+
+#if SIZEOF_LONG == 4 && defined(HAVE_ZEND_LONG64)
+# define PHP_BLOOM_WIDTH 64
+#else
+# define PHP_BLOOM_WIDTH 32
+#endif
+
+#define PHP_BLOOM_ADD(mask, ch) \
+ ((mask |= (1UL << ((ch) & (PHP_BLOOM_WIDTH -1)))))
+#define PHP_BLOOM(mask, ch) \
+ ((mask & (1UL << ((ch) & (PHP_BLOOM_WIDTH -1)))))
+
+/* {{{ This is a fast search code based on the Python search method */
+static char *
+php_fastsearch(char *haystack, char *needle, int needle_len, char *end)
+{
+ char *p = haystack;
+ int haystack_len = end - haystack;
+ unsigned long mask;
+ int skip;
+ int j, mlast;
+ size_t i;
+
+ /* Special case one-byte needles & needles longer than haystack */
+ if (needle_len == 1) {
+ return (char *)memchr(p, *needle, (end-p));
+ }
+ if (needle_len > haystack_len) {
+ return NULL;
+ }
+
+ mlast = needle_len - 1;
+ skip = mlast - 1;
+ end -= needle_len;
+
+ /* Process pattern up to penultimate char. Also calculate the
+ * longest safe skip distance in lieu of a proper skip table.
+ */
+ for (i = 0; i < mlast; i++) {
+ PHP_BLOOM_ADD(mask, needle[i]);
+ if (needle[i] == needle[mlast]) {
+ skip = mlast - i - 1;
+ }
+ }
+ /* Add pattern[-1] to mask */
+ PHP_BLOOM_ADD(mask, needle[mlast]);
+
+ for (i = 0; i <= haystack_len; i++) {
+ /* Note: using mlast in the skip path slows things down on x86 */
+ if (haystack[i+needle_len-1] == needle[needle_len-1]) {
+
+ /* Candidate match, scan the rest
+ * todo: memcmp() ? */
+ for (j = 0; j < mlast; j++) {
+ if (haystack[i+j] != needle[j]) {
+ break;
+ }
+ }
+
+ if (j == mlast) {
+ return (char *)(p + i);
+ }
+
+ /* Mismatch: check if next character is part of pattern */
+ if (!PHP_BLOOM(mask, haystack[i+needle_len])) {
+ i = i + needle_len;
+ } else {
+ i = i + skip;
+ }
+ } else {
+ /* Opportunistic skip: the current char is not equal to the
+ * last pattern char. Check if *next* character is part of
+ * the pattern. If not, skip away.
+ */
+ if (!PHP_BLOOM(mask, haystack[i+needle_len])) {
+ i = i + needle_len;
+ }
+ }
+ }
+
+ /* No match. How sad. */
+ return NULL;
+}
+/* }}} */
+/* End Python Fast Search Code */
+
/* {{{ proto string strstr(string haystack, string needle[, bool part])
Finds first occurrence of a string within another */
PHP_FUNCTION(strstr)
@@ -1696,14 +1795,25 @@
RETURN_FALSE;
}
- found = php_memnstr(haystack, Z_STRVAL_P(needle), Z_STRLEN_P(needle), haystack + haystack_len);
+ if (haystack_len >= FASTSEARCH_MIN) {
+ found = php_fastsearch(haystack,
+ Z_STRVAL_P(needle),
+ Z_STRLEN_P(needle),
+ haystack + haystack_len);
+ } else {
+ found = php_memnstr(haystack,
+ Z_STRVAL_P(needle),
+ Z_STRLEN_P(needle),
+ haystack + haystack_len);
+ }
} else {
if (php_needle_char(needle, needle_char TSRMLS_CC) != SUCCESS) {
RETURN_FALSE;
}
needle_char[1] = 0;
-
- found = php_memnstr(haystack, needle_char, 1, haystack + haystack_len);
+ found = php_memnstr(haystack,
+ needle_char, 1,
+ haystack + haystack_len);
}
if (found) {
@@ -1724,6 +1834,7 @@
/* {{{ proto int strpos(string haystack, string needle [, int offset])
Finds position of first occurrence of a string within another */
+
PHP_FUNCTION(strpos)
{
zval *needle;
@@ -1732,7 +1843,7 @@
char needle_char[2];
long offset = 0;
int haystack_len;
-
+
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sz|l", &haystack, &haystack_len, &needle, &offset) == FAILURE) {
return;
}
@@ -1748,20 +1859,26 @@
RETURN_FALSE;
}
- found = php_memnstr(haystack + offset,
+ if (haystack_len >= FASTSEARCH_MIN) {
+ found = php_fastsearch(haystack + offset,
Z_STRVAL_P(needle),
Z_STRLEN_P(needle),
haystack + haystack_len);
+ } else {
+ found = php_memnstr(haystack + offset,
+ Z_STRVAL_P(needle),
+ Z_STRLEN_P(needle),
+ haystack + haystack_len);
+ }
} else {
if (php_needle_char(needle, needle_char TSRMLS_CC) != SUCCESS) {
RETURN_FALSE;
}
needle_char[1] = 0;
-
found = php_memnstr(haystack + offset,
needle_char,
1,
- haystack + haystack_len);
+ haystack + haystack_len);
}
if (found) {
@@ -1772,6 +1889,7 @@
}
/* }}} */
+
/* {{{ proto int stripos(string haystack, string needle [, int offset])
Finds position of first occurrence of a string within another, case insensitive */
PHP_FUNCTION(stripos)
@@ -1808,7 +1926,12 @@
needle_dup = estrndup(Z_STRVAL_P(needle), Z_STRLEN_P(needle));
php_strtolower(needle_dup, Z_STRLEN_P(needle));
- found = php_memnstr(haystack_dup + offset, needle_dup, Z_STRLEN_P(needle), haystack_dup + haystack_len);
+
+ if (haystack_len >= FASTSEARCH_MIN) {
+ found = php_fastsearch(haystack_dup + offset, needle_dup, Z_STRLEN_P(needle), haystack_dup + haystack_len);
+ } else {
+ found = php_memnstr(haystack_dup + offset, needle_dup, Z_STRLEN_P(needle), haystack_dup + haystack_len);
+ }
} else {
if (php_needle_char(needle, needle_char TSRMLS_CC) != SUCCESS) {
efree(haystack_dup);
@@ -1816,9 +1939,10 @@
}
needle_char[0] = tolower(needle_char[0]);
needle_char[1] = '\0';
- found = php_memnstr(haystack_dup + offset,
- needle_char,
- sizeof(needle_char) - 1,
+
+ found = php_memnstr(haystack_dup + offset,
+ needle_char,
+ sizeof(needle_char) - 1,
haystack_dup + haystack_len);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment