• 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Email addresses with a "+" are not parsed correctly

#11
[eluser]sophistry[/eluser]
ok, got a few more tweaks here that should be a "final" version.

fixed the "capture dot at end of pattern problem"
made it much more strict overall... (see test cases in controller below).

EDIT: replaced a-z0-9 with [:alnum:] character class
Code:
<?php

class Test extends Controller {

    /**
     * PHP4-style constructor: forwards to the parent Controller constructor.
     */
    function Test()
    {
        parent::Controller();
    }

    /**
     * Prints two ad-hoc checks of the email-detection regex:
     *
     *  1. each character of the candidate character list is matched against
     *     a character class built from that same list, and the match is echoed;
     *  2. the detection pattern is run over a list of sample addresses (valid
     *     ones and deliberately malformed ones) and the captures are dumped.
     */
    function index()
    {
        $chars = '-.a-zA-Z0-9#!$%&*+/\'=?^_`{}|~';
        $len = strlen($chars);
        print_r($chars); echo '<br>';
        for ($i = 0; $i < $len; $i++)
        {
            // Print only on a match: a character the class fails to cover
            // then shows up as a gap in the output instead of a PHP notice.
            if (preg_match(":[$chars]:", $chars[$i], $matches))
            {
                print_r($matches[0]);
            }
        }

        // test on "real" addresses
        $strs = array(
                    "back|to=school~w0w.does+this^work?@sub.email.codeigniter.com",
                    "back{to}school-does+this^work@email.codeigniter.com",
                    'back{to}school#works!hostname@email.codeigniter.com',
                    'h.r@example.com',
                    'h.r@dot-at-end.example.com.',
                    'h.r@double-dot-at-end.example.com..',
                    'h.r@sub.example.com',
                    'h.r@sub-sub.email.example.com',
                    'h-r@example.com',
                    '.hr_does_not_capture_initial_dot@example.com',
                    'h#r@example.com',
                    'h!r@example.com',
                    'h$r@example.com',
                    'h%r@example.com',
                    // ampersand case: no trailing ";" here, otherwise the
                    // character right before the "@" is outside the mailbox
                    // class and the ampersand is never exercised
                    'h&r@example.com',
                    'h*r@example.com',
                    'h+r@example.com',
                    'h/r@example.com',
                    "h'r@example.com",
                    'h=r@example.com',
                    'h?r@example.com',
                    'h^r@example.com',
                    'h_r@example.com',
                    'h`r@example.com',
                    'h{r@example.com',
                    'h}r@example.com',
                    'h|r@example.com',
                    'h~r@example.com',
                    'h..r@example.com',
                    'how..r@example.com',
                    'how_wild..rare@example.com',
                    'h...r@example.com',
                    'h.r@example..com',
                    'h.r@tell.me.how.could.the.pattern.not.capture.this.example..com',
                    'h.r@-example-with-inital-dash.com',
                    'h.r@hostname-masks-erroneous-domain.-dash.com',
                    'h.r@dash-at-end-of-tld.com-',
                    'h.r@dash-at-end-of-domain-.com',
                    'h.r@.dot-in-front-of-domain.com',
                    );
        $chars_not_dot = '-a-z0-9#!$%&*+/\'=?^_`{}|~';

        // The pattern is caseless (i modifier) and uses ";" as delimiter.
        // Capture 1 (mailbox): one non-dot character from the list, then any
        //   number of list characters or single dots (a dot may not be
        //   followed by another dot).
        // Capture 2 (domain): alnum, any alnums/dashes, alnum, a single dot,
        //   alnum, one or more alnums/dashes/single dots, closing alnum.
        // Built once because it does not depend on the address under test.
        $pattern = ";([$chars_not_dot](?:[$chars_not_dot]|[.](?![.]))*)@((?:[[:alnum:]])(?:[-[:alnum:]]*)(?:[[:alnum:]])(?:[.](?![.]))(?:[[:alnum:]])(?:[-[:alnum:]]|[.](?![.]))+(?:[[:alnum:]]));i";

        foreach ($strs as $s)
        {
            preg_match_all($pattern, $s, $matches);
            $this->_p($s);
            $this->_p($matches);
        }

    }

    /**
     * Debug helper: dumps a value with print_r() inside a <pre> element.
     */
    function _p($d) {echo'<pre>';print_r($d);echo'</pre>';}

}

/* End of file test.php */

please test and examine.

#12
[eluser]Pascal Kriete[/eluser]
Wow, you're on fire. I'll take a closer look later tonight - looks pretty good at first glance though.

#13
[eluser]sophistry[/eluser]
Found a few more holes and plugged them.

I also wrote a description of the regex in comments in the code below since it's getting pretty hoary!

Finally, I wrapped some test code around it and put it head to head with the old regex auto_link email detector so you can see for yourself how lax the old one was.

Code:
<?php

class Test extends Controller {

    /**
     * PHP4-style constructor; hands off to the parent Controller constructor.
     */
    function Test()
    {
        parent::Controller();
    }

    /**
     * Runs the new email-detection pattern and the original auto_link
     * pattern over the same sample strings and echoes one HTML table per
     * pattern (input string, full match, first capture, second capture).
     */
    function index()
    {
        // sample addresses: valid ones plus deliberately malformed ones
        $samples = array(
            "back|to=school~w0w.does+this^work?@sub.email.codeigniter.com",
            "back{to}school-does+this^work@email.codeigniter.com",
            'back{to}school#works!hostname@email.codeigniter.com',
            'h.r@example.com',
            'h.r@dot-at-end.example.com.',
            'h.r@double-dot-at-end.example.com..',
            'h.r@sub.example.com',
            'h.r@sub-sub.email.example.com',
            'h-r@example.com',
            '.hr_does_not_capture_initial_dot@example.com',
            'h#r@example.com',
            'h!r@example.com',
            'h$r@example.com',
            'h%r@example.com',
            'h&r@example.com',
            'h*r@example.com',
            'h+r@example.com',
            'h/r@example.com',
            "h'r@example.com",
            'h=r@example.com',
            'h?r@example.com',
            'h^r@example.com',
            'h_r@example.com',
            'h`r@example.com',
            'h{r@example.com',
            'h}r@example.com',
            'h|r@example.com',
            'h~r@example.com',
            'h.r@too@many-at-signs.com',
            'h.r@dash-at-end-of-tld.com-',
            'h..r@example.com',
            'how..r@example.com',
            'how_wild..rare@example.com',
            'h...r@example.com',
            'h@tm.se',
            'h.r@tell.me.how.could.the.pattern.not.capture.this.example..com',
            'h.r@example-with-double-dot-after-domain..com',
            'h.r@anotherdoubledot..com',
            'h.r@-inital-dash.com',
            'h.r@--------.com',
            'h.r@.com',
            'h.r@....com',
            'h.r@hostname-masks-erroneous-domain.-initial-dash.com',
            'h.r@cannot-have-dash-at-end-of-domain-.com',
            'h.r@.dot-not-allowed-in-front-of-domain.com',
            'h.r.@dot-not-allowed-at-end-of-mailbox.com',
            'h.r@tld-too-short.c',
            'h@t.c',
            'h@t.co',
            'h.r@hostname.with-tld-too-short.c',
        );

        $chars_not_dot = '-a-z0-9#!$%&*+/\'=?^_`{}|~';

        // New detector. Caseless (i modifier); (?:...) groups are
        // non-capturing so the matches array holds only mailbox and domain.
        //  - mailbox: one non-dot character from the list, then any number
        //    of list characters or single dots (negative lookahead rejects
        //    a dot followed by a dot);
        //  - the "@" may not be preceded by a dot (negative lookbehind);
        //  - domain: alnum, zero or more alnums/dashes, alnum, then a single
        //    dot, so the first label has at least two alnums (per the
        //    original author: the rule as of 2008-08-30, which ICANN may
        //    change);
        //  - after that dot: alnum, zero or more alnums/dashes/single dots,
        //    and a closing alnum (no trailing dot or dash).
        $new_pattern = ";([$chars_not_dot](?:[$chars_not_dot]|[.](?![.]))*)(?<![.])@((?:[[:alnum:]])(?:[-[:alnum:]]*)(?:[[:alnum:]])(?:[.](?![.]))(?:[[:alnum:]])(?:[-[:alnum:]]|[.](?![.]))*(?:[[:alnum:]]));i";

        // the original auto_link email matcher
        $old_pattern = "/([a-zA-Z0-9_\.\-]+)@([a-zA-Z0-9\-]+)\.([a-zA-Z0-9\-\.]*)/i";

        $new_rows = array();
        $old_rows = array();
        foreach ($samples as $sample)
        {
            // a missing capture becomes null in the table row
            preg_match_all($new_pattern, $sample, $hit);
            $new_rows[] = array(
                $sample,
                isset($hit[0][0]) ? $hit[0][0] : null,
                isset($hit[1][0]) ? $hit[1][0] : null,
                isset($hit[2][0]) ? $hit[2][0] : null,
            );

            preg_match_all($old_pattern, $sample, $hit);
            $old_rows[] = array(
                $sample,
                isset($hit[0][0]) ? $hit[0][0] : null,
                isset($hit[1][0]) ? $hit[1][0] : null,
                isset($hit[2][0]) ? $hit[2][0] : null,
            );
        }

        $this->load->library('table');

        $this->table->set_heading('String In', 'Full pattern', 'mailbox', 'domain');
        echo '<hr>NEW regex';
        echo $this->table->generate($new_rows);

        echo '<hr>OLD regex';
        // reset the table, then set the same heading again for the second one
        $this->table->clear();
        $this->table->set_heading('String In', 'Full pattern', 'mailbox', 'domain');
        echo $this->table->generate($old_rows);
    }

    /**
     * Debug helper: print_r() the given value wrapped in <pre> tags.
     */
    function _p($d)
    {
        echo'<pre>';
        print_r($d);
        echo'</pre>';
    }
}
/* End of file test.php */

#14
[eluser]sophistry[/eluser]
I trimmed it down. Now there are fewer faults and it is smaller. I took out the regex description and added a few more test emails. I decided that since ICANN will eventually accept single-character domains, this should too (plus it was too hard to figure out how to exclude them in regex alone). Also, like the previous versions, this cannot distinguish between real TLDs and invalid ones.

NOTE: i removed the single-quote and the backtick from the char list because it was messing up the formatting.
Code:
<?php
class Test extends Controller {

    /**
     * PHP4-style constructor: forwards to the parent Controller constructor.
     */
    function Test()
    {
        parent::Controller();
    }

    /**
     * Compares the trimmed-down email-detection pattern with the original
     * auto_link pattern: both are run over the same sample strings and the
     * results are echoed as two HTML tables (input string, full match,
     * first capture, second capture).
     */
    function index()
    {
        $strs = array(
                    "back|to=school~w0w.does+this^work?@sub.email.codeigniter.com",
                    "back{to}school-does+this^work@email.codeigniter.com",
                    'back{to}school#works!hostname@email.codeigniter.com',
                    'h.dot@example.com',
                    'h.r@dot-at-end.example.com.',
                    'h.r@double-dot-at-end.example.com..',
                    'h.r@sub.example.com',
                    'h.r@sub-sub.email.example.com',
                    'h-dash@example.com',
                    '.hr_does_not_capture_initial_dot@example.com',
                    'h#pound@example.com',
                    'h!bang@example.com',
                    'h$dollar@example.com',
                    'h%perc@example.com',
                    // ampersand case: no trailing ";" here, otherwise the
                    // character right before the "@" is outside the mailbox
                    // class and the ampersand is never exercised
                    'h&ampersand@example.com',
                    'h*star@example.com',
                    'h+plus@example.com',
                    'h/slash@example.com',
                    "h'squote@example.com",
                    'h=equals@example.com',
                    'h?qmark@example.com',
                    'h^caret@example.com',
                    'h_underscore@example.com',
                    'h`backtick@example.com',
                    'h{opencurly@example.com',
                    'h}closecurly@example.com',
                    'h|pipe@example.com',
                    'h~tilde@example.com',
                    'h.r@too@many-at-signs.com',
                    'h.r@too.with.dots-tootoo@many-at-signs.com',
                    'h.r@too.with.dots-@many-at-signs.com',
                    'h.r@dash-at-end-of-tld.com-',
                    'h.r@dash-dot-at-end-of-tld.com-.',
                    'h.r@dot-dash-at-end-of-tld.com.-',
                    'h..r@example.com',
                    'how..r@example.com',
                    'how_wild..rare@example.com',
                    'h...r@example.com',
                    'h@tm.se',
                    'h.r@tm.ex.com',
                    'h.r@t.m.h.c.t.p.n.c.tm.ex.com',
                    'h.r@t.ex.com',
                    'h.r@tell.me.how.could.the.pattern.not.capture.this.example..com',
                    'h.r@example-with-double-dot-after-domain..com',
                    'h.r@anotherdoubledot..com',
                    'h.r@-inital-dash.com',
                    'h.r@--------.com',
                    'h.r@.com',
                    'h.r@....com',
                    'h.r@hostname-masks-erroneous-domain.-initial-dash.com',
                    'h.r@cannot-have-dash-at-end-of-domain-.com',
                    'h.r@.dot-not-allowed-in-front-of-domain.com',
                    'h.r.@dot-not-allowed-at-end-of-mailbox.com',
                    'h.r@tld-too-short.c',
                    'h@t.c',
                    'h@t.co',
                    'domain_too_short@t.com',
                    'domain_too_short-and-has-dash@t-.com',
                    'no-at-signs.com',
                    'h.r@hostname.with-tld-too-short.c',
                    'h.r@hostname.not-a-tld',
                    'h.r@domain.invalid',
                        );
        // NOTE: add the escaped single-quote and the backtick back in for completeness
        // (without them the mailbox capture cannot contain those two characters)
        $chars_not_dot = '-a-z0-9#!$%&*+/=?^_{}|~';

        // No TLD lookup is done here: this class defines no TLD helper, and
        // the pattern below does not check the TLD against a list of real ones.

        // Caseless (i modifier), ";" as delimiter.
        // Capture 1 (mailbox): one non-dot character from the list, then any
        //   number of list characters or single dots; the "@" may not be
        //   preceded by a dot (negative lookbehind).
        // Capture 2 (domain): alnum, then one or more alnums or single
        //   dashes/dots (a dash or dot may not be followed by a dash or dot),
        //   ending in two or more alnums that directly follow a dot.
        // Built once because it does not depend on the string under test.
        $new_pattern = ";([$chars_not_dot](?:[$chars_not_dot]|[.](?![.]))*)(?<![.])@((?:[[:alnum:]])(?:[[:alnum:]]|[-.](?![-.]))+(?<=[.])(?:[[:alnum:]]){2,});i";

        $email_detector_regex_NEW_array = array();
        $email_detector_regex_OLD_array = array();
        foreach ($strs as $str)
        {
            preg_match_all($new_pattern, $str, $matches);
            // a missing capture becomes null in the table row
            $email_detector_regex_NEW_array[] = array(
                $str,
                isset($matches[0][0]) ? $matches[0][0] : null,
                isset($matches[1][0]) ? $matches[1][0] : null,
                isset($matches[2][0]) ? $matches[2][0] : null,
            );
        }

        foreach ($strs as $str)
        {
            // the original auto_link email matcher
            preg_match_all("/([a-zA-Z0-9_\.\-]+)@([a-zA-Z0-9\-]+)\.([a-zA-Z0-9\-\.]*)/i", $str, $matches);
            $email_detector_regex_OLD_array[] = array(
                $str,
                isset($matches[0][0]) ? $matches[0][0] : null,
                isset($matches[1][0]) ? $matches[1][0] : null,
                isset($matches[2][0]) ? $matches[2][0] : null,
            );
        }

        $this->load->library('table');
        $this->table->set_heading('String In', 'Full pattern', 'mailbox', 'domain');
        echo '<hr>NEW regex';
        echo $this->table->generate($email_detector_regex_NEW_array);
        echo '<hr>OLD regex';
        // reset the table, then set the same heading again for the second one
        $this->table->clear();
        $this->table->set_heading('String In', 'Full pattern', 'mailbox', 'domain');
        echo $this->table->generate($email_detector_regex_OLD_array);

    }

    /**
     * Debug helper: dumps a value with print_r() inside a <pre> element.
     */
    function _p($d) {echo'<pre>';print_r($d);echo'</pre>';}
}

/* End of file test.php */

#15
[eluser]sophistry[/eluser]
bump.. anyone looking at this new code submission? i think it is a marked improvement... any opinions on the test code output?

#16
[eluser]Derek Allard[/eluser]
Yup, I'm looking at it. Sorry for the lack of response. The "+" will get added to the list of valid characters for the autolinker, but not all RFC-compliant email addresses will be supported. Your code will probably make it in as a valid_rfc_email() function or something, though. And hey man, sincere thanks for this contribution!

#17
[eluser]johnwbaxter[/eluser]
In gmail there is a very handy feature using the +.

If you sign up to a website, you can enter jeff+websitename@gmail.com; when Gmail receives mail sent to that address, it assigns the label "websitename".

A great use for this: say you sign up to a website and then start getting spam. You can find out who leaked your e-mail address, because the spam will be addressed to jeff+websitename@gmail.com, and you can go to them and be cross.


Digg   Delicious   Reddit   Facebook   Twitter   StumbleUpon  


  Theme © 2014 iAndrew  
Powered By MyBB, © 2002-2019 MyBB Group.