PHP采集程序中常用的函數
/**
 * Return the URL (path plus query string) of the current request.
 *
 * Uses $_SERVER['REQUEST_URI'] when it is non-empty; otherwise builds the
 * value from $_SERVER['PHP_SELF'] and, when present, $_SERVER['QUERY_STRING'].
 *
 * @return string The request URI, or the script path with "?query" appended.
 */
function get_php_url()
{
    if (!empty($_SERVER['REQUEST_URI'])) {
        return $_SERVER['REQUEST_URI'];
    }

    // Guard the lookup so a missing PHP_SELF yields '' instead of a warning.
    $scriptName = isset($_SERVER['PHP_SELF']) ? $_SERVER['PHP_SELF'] : '';

    if (empty($_SERVER['QUERY_STRING'])) {
        return $scriptName;
    }

    return $scriptName . '?' . $_SERVER['QUERY_STRING'];
}

/**
 * Convert full-width digits to ASCII digits and reduce the input to a number.
 *
 * After the digit conversion every character other than 0-9 and "." is
 * removed, as are zeros at the very start of the string.
 *
 * The full-width literals below are stored in this file's encoding
 * (assumes UTF-8 input - TODO confirm against callers).
 *
 * @param mixed $fnum Text that may contain full-width digits.
 *
 * @return string|int The cleaned numeric string, or integer 0 when nothing is left.
 */
function GetAlabNum($fnum)
{
    $fullWidth = array('０', '１', '２', '３', '４', '５', '６', '７', '８', '９');
    $halfWidth = array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9');

    $fnum = str_replace($fullWidth, $halfWidth, (string) $fnum);

    // PCRE replacement for the removed ereg_replace('[^0-9.]|^0{1,}', ...) call.
    $fnum = preg_replace('/[^0-9.]|^0+/', '', $fnum);

    if ($fnum === null || $fnum === '') {
        return 0;
    }

    return $fnum;
}

/**
 * Prepare plain text for display inside HTML.
 *
 * Escapes "<" and ">" as entities and turns each run of CR/LF characters
 * into a single "<br/>" followed by CRLF.
 *
 * @param string $txt Plain text.
 *
 * @return string Text that is safe to place in an HTML body.
 */
function Text2Html($txt)
{
    // NOTE(review): this replacement is reproduced as found; the search string
    // looks like a damaged entity or full-width space - confirm the intent.
    $txt = str_replace('; ', ' ', $txt);

    $txt = str_replace('<', '&lt;', $txt);
    $txt = str_replace('>', '&gt;', $txt);

    // Greedy on purpose so "\r\n" (or several blank lines) gives one <br/>.
    $txt = preg_replace('/[\r\n]+/', "<br/>\r\n", $txt);

    return $txt;
}
/**
 * Neutralise HTML markup by escaping angle brackets.
 *
 * Tags are not removed; "<" and ">" are replaced with "&lt;" and "&gt;".
 *
 * @param string $str Text that may contain markup.
 *
 * @return string The text with angle brackets escaped.
 */
function ClearHtml($str)
{
    return str_replace(array('<', '>'), array('&lt;', '&gt;'), $str);
}

/**
 * Rewrite root-relative href/src attributes in $content to absolute URLs.
 *
 * The scheme and host are taken from $feed_url. Only attribute values that
 * start with a single "/" are rewritten; protocol-relative values ("//host/...")
 * and already absolute URLs are left alone. Both quote styles are handled.
 *
 * @param string $content  HTML fragment to rewrite.
 * @param string $feed_url URL the fragment came from, e.g. "http://example.com/feed.xml".
 *
 * @return string The rewritten HTML, or $content unchanged when $feed_url
 *                has no http/https/ftp scheme or no host part.
 */
function relative_to_absolute($content, $feed_url)
{
    $feed_url = (string) $feed_url;

    preg_match('~(http|https|ftp)://~', $feed_url, $protocol);

    // Drop the scheme, then everything from the first path, query or fragment
    // delimiter onwards, leaving only the host part.
    $server_url = preg_replace('~(http|https|ftp|news)://~', '', $feed_url);
    $server_url = preg_replace('~[/?#].*~s', '', (string) $server_url);

    if ((string) $server_url === '' || !isset($protocol[0])) {
        return $content;
    }

    $base = $protocol[0] . $server_url . '/';

    // A callback is used so that "$" or "\" in the host can never be read as
    // a back-reference in the replacement string.
    $rewritten = preg_replace_callback(
        '~(href|src)=(["\'])/(?!/)~i',
        function ($m) use ($base) {
            return $m[1] . '=' . $m[2] . $base;
        },
        $content
    );

    // preg_replace_callback returns null on a PCRE failure; fall back to the input.
    return $rewritten === null ? $content : $rewritten;
}

/**
 * Collect every anchor in an HTML fragment.
 *
 * Matches <a> tags whose first attribute is href and whose link text
 * contains no ">" character (so anchors wrapping other tags are skipped).
 *
 * @param string $code HTML fragment.
 *
 * @return array Array with two parallel lists: 'name' (link texts) and 'url' (href values).
 */
function get_all_url($code)
{
    preg_match_all(
        '/<a\s+href=["\']?([^>"\' ]+)["\']?\s*[^>]*>([^>]+)<\/a>/i',
        $code,
        $arr
    );

    return array('name' => $arr[2], 'url' => $arr[1]);
}
/**
 * Return the text found between a start marker and an end marker.
 *
 * The text is taken from after the first occurrence of $start up to the next
 * occurrence of either $start or $end, whichever comes first.
 *
 * @param string $str   Text to search.
 * @param string $start Opening marker.
 * @param string $end   Closing marker.
 *
 * @return string|null The enclosed text; '' when $start does not occur;
 *                     null when either marker is empty.
 */
function get_tag_data($str, $start, $end)
{
    if ((string) $start === '' || (string) $end === '') {
        return null;
    }

    $afterStart = explode($start, (string) $str);

    // Without this guard a missing start marker reads an undefined offset.
    if (!isset($afterStart[1])) {
        return '';
    }

    $beforeEnd = explode($end, $afterStart[1]);

    return $beforeEnd[0];
}

/**
 * Convert each row of an HTML table into one CSV-style string.
 *
 * Every cell is wrapped in double quotes and cells are joined by commas,
 * e.g. '"a","b"'. All whitespace and "&nbsp;" entities are removed, including
 * whitespace inside cell text. Quotes inside cells are not escaped.
 *
 * @param string $table HTML containing <tr>/<td> markup.
 *
 * @return array List of strings, one per table row.
 */
function get_tr_array($table)
{
    // Opening <td ...> becomes the opening quote, </td> the closing quote plus comma.
    $table = preg_replace("'<td[^>]*?>'si", '"', $table);
    $table = str_ireplace('</td>', '",', $table);

    // Mark row ends before the remaining tags are stripped.
    $table = str_ireplace('</tr>', '{tr}', $table);

    // Strip every remaining HTML tag.
    $table = preg_replace('/<[^<>]*>/', '', $table);

    // Strip whitespace and non-breaking-space entities.
    $table = preg_replace('/\s+/', '', $table);
    $table = str_replace('&nbsp;', '', $table);

    // Each row ends with ",{tr}", so the last piece after splitting is a leftover.
    $rows = explode(',{tr}', $table);
    array_pop($rows);

    return $rows;
}
/**
 * Convert an HTML table into a two-dimensional array of cell texts.
 *
 * Used for scraping tabular data. All whitespace and "&nbsp;" entities are
 * removed, including whitespace inside cell text, and any markup inside a
 * cell is stripped.
 *
 * @param string $table HTML containing <table>/<tr>/<td> markup.
 *
 * @return array List of rows, each a list of cell strings; an empty array
 *               when no row is found.
 */
function get_td_array($table)
{
    // Drop the opening tags; the closing tags become row and cell separators.
    $table = preg_replace("'<table[^>]*?>'si", '', $table);
    $table = preg_replace("'<tr[^>]*?>'si", '', $table);
    $table = preg_replace("'<td[^>]*?>'si", '', $table);
    $table = str_ireplace('</tr>', '{tr}', $table);
    $table = str_ireplace('</td>', '{td}', $table);

    // Strip every remaining HTML tag.
    $table = preg_replace('/<[^<>]*>/', '', $table);

    // Strip whitespace and non-breaking-space entities.
    $table = preg_replace('/\s+/', '', $table);
    $table = str_replace('&nbsp;', '', $table);

    // Every row ends with "{tr}", so the last piece after splitting is a leftover.
    $rows = explode('{tr}', $table);
    array_pop($rows);

    // Initialised up front so an empty table returns array() rather than an
    // undefined variable.
    $td_array = array();
    foreach ($rows as $row) {
        // Every cell ends with "{td}", which leaves the same kind of leftover.
        $cells = explode('{td}', $row);
        array_pop($cells);
        $td_array[] = $cells;
    }

    return $td_array;
}
/**
 * Return every English word (run of ASCII letters) found in a string.
 *
 * The result is sorted with sort() and re-indexed from 0.
 *
 * @param string $str      Text to scan.
 * @param bool   $distinct When truthy (the default), duplicate words are removed.
 *
 * @return array Sorted list of words.
 */
function split_en_str($str, $distinct = true)
{
    preg_match_all('/([a-zA-Z]+)/', $str, $match);

    $words = $match[1];
    if ($distinct) {
        $words = array_unique($words);
    }

    // sort() also re-indexes, closing the gaps array_unique() leaves behind.
    sort($words);

    return $words;
}
