[eluser]sophistry[/eluser]
Found a few more holes and plugged them.
I also wrote a description of the regex in comments in the code below, since it's getting pretty hairy!
Finally, I wrapped some test code around it and put it head to head with the old regex auto_link email detector so you can see for yourself how lax the old one was.
Code: <?php
// Test controller that runs a list of sample address strings through two
// email-detecting regexes (the NEW stricter one and the OLD auto_link one)
// and prints the first match of each as an HTML table for comparison.
class Test extends Controller {

    function Test()
    {
        parent::Controller();
    }

    // Builds one result row per sample string for each regex and echoes
    // the two tables (NEW first, then OLD).
    function index()
    {
        // test on "real" addresses
        $strs = array(
            "back|to=school~w0w.does+this^[email protected]",
            "back{to}school-does+this^[email protected]",
            'back{to}school#[email protected]',
            '[email protected]',
            '[email protected].',
            '[email protected]..',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            'h#[email protected]',
            '[email protected]',
            'h$[email protected]',
            'h%[email protected]',
            'h&[email protected]',
            'h*[email protected]',
            '[email protected]',
            'h/[email protected]',
            "h'[email protected]",
            '[email protected]',
            '[email protected]',
            'h^[email protected]',
            '[email protected]',
            'h`[email protected]',
            'h{[email protected]',
            'h}[email protected]',
            'h|[email protected]',
            '[email protected]',
            'h.r@[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
        );

        // Characters allowed in the mailbox part, excluding the dot.
        // Interpolated into a [...] character class below, so the leading
        // dash is a literal dash.
        $chars_not_dot = '-a-z0-9#!$%&*+/\'=?^_`{}|~';

        // The NEW pattern. Built once, outside the loop, because it does not
        // depend on the string being tested.
        // note: regex has i switch so is caseless
        // note: (?:pattern) denotes a non-capturing sub-pattern
        // so we don't get lots of junk in the matches array
        // 1st check the mailbox part of the address:
        // it must start with one char that is not a dot,
        // followed by zero or more characters (that are not
        // dots) or a single dot (but no double dots - negative lookahead assertion)
        // followed by an @ sign that is not preceded by a dot (negative lookbehind assertion)
        // domain must start with alnum [a-z0-9] followed by zero or more alnums or dashes
        // followed by one more alnum then a dot (again, no double dots)
        // this restricts domain names to minimum two alnums
        // (standard 20080830 which may be changing in future ICANN spec)
        // after minimum first two alnums and single dot
        // match another alnum followed by one or more alnums, dashes or single dots (no double dots)
        // and domain must close with an alnum (not dots, no dashes)
        // capture group 1 = mailbox, capture group 2 = whole domain
        $new_pattern = ";([$chars_not_dot](?:[$chars_not_dot]|[.](?![.]))*)(?<![.])@((?:[[:alnum:]])(?:[-[:alnum:]]*)(?:[[:alnum:]])(?:[.](?![.]))(?:[[:alnum:]])(?:[-[:alnum:]]|[.](?![.]))*(?:[[:alnum:]]));i";

        // the original auto_link email matcher
        // capture group 1 = mailbox, group 2 = domain label before the first
        // dot, group 3 = everything after that dot
        $old_pattern = "/([a-zA-Z0-9_\.\-]+)@([a-zA-Z0-9\-]+)\.([a-zA-Z0-9\-\.]*)/i";

        $email_detector_regex_NEW_array = array();
        $email_detector_regex_OLD_array = array();

        foreach ($strs as $str)
        {
            list($full, $mailbox, $domain) = $this->_first_captures($new_pattern, $str, 2);
            $email_detector_regex_NEW_array[] = array($str, $full, $mailbox, $domain);

            list($full, $mailbox, $label, $rest) = $this->_first_captures($old_pattern, $str, 3);
            // The old pattern splits the domain over two groups around a
            // mandatory literal dot; rejoin them so the 'domain' column shows
            // the whole domain the old regex matched, not just its first label.
            $domain = ($full === NULL) ? NULL : $label.'.'.$rest;
            $email_detector_regex_OLD_array[] = array($str, $full, $mailbox, $domain);
        }

        $this->load->library('table');

        $this->table->set_heading('String In', 'Full pattern', 'mailbox', 'domain');
        echo '<hr>NEW regex';
        echo $this->table->generate($email_detector_regex_NEW_array);

        echo '<hr>OLD regex';
        $this->table->clear();
        $this->table->set_heading('String In', 'Full pattern', 'mailbox', 'domain');
        echo $this->table->generate($email_detector_regex_OLD_array);
    }

    // Runs $pattern over $str with preg_match_all and returns, for the full
    // match (index 0) and each capture group 1..$group_count, the value from
    // the FIRST match only. Any entry that did not match is NULL, so the
    // result always has $group_count + 1 elements.
    // Underscore-prefixed like _p() below; NOTE(review): CodeIgniter is
    // documented not to serve underscore-prefixed methods via URL - confirm
    // for the version in use.
    function _first_captures($pattern, $str, $group_count)
    {
        preg_match_all($pattern, $str, $matches);

        $captures = array();
        for ($i = 0; $i <= $group_count; $i++)
        {
            // explicit existence check instead of @-suppressing the notice
            $captures[] = isset($matches[$i][0]) ? $matches[$i][0] : NULL;
        }
        return $captures;
    }

    // Debug helper: dumps $d with print_r inside a <pre> block.
    function _p($d)
    {
        echo '<pre>';
        print_r($d);
        echo '</pre>';
    }
}
/* End of file test. */
|