HTML Parser PHP Tutorial
HTML Parser PHP Tutorial
htmltabletodb.class.php
<?php
require('htmlparser.inc');
/**
 * Converts the markup of an HTML table into a plain PHP array of rows.
 *
 * Depends on the HtmlParser class and the NODE_TYPE_* constants that are
 * loaded from htmlparser.inc.
 */
class htmlTabletoDb
{
    /**
     * Extracts the cell texts of every table row found in $Table.
     *
     * The markup is walked node by node:
     *  - a <tr> start tag closes the row collected so far and begins a new one;
     *  - a </td> end tag closes the current cell;
     *  - text nodes are appended to the current cell.
     * Text that follows the last </td> of a row is discarded, and rows that
     * ended up without any cell are skipped. Cell text is returned untrimmed.
     * <td> cells that appear before any <tr> are collected as one implicit row.
     *
     * Rows are collected directly instead of being glued together with marker
     * strings and split again, so cell text containing "/", "::" or similar
     * sequences is preserved as-is.
     *
     * @param string $Table HTML markup containing the table.
     * @return array List of rows; each row is a list of cell strings.
     *               An empty array is returned when no row was found.
     */
    function ParseTable($Table)
    {
        $data = array();      // finished rows
        $cells = array();     // cells of the row being read
        $cellText = '';       // text of the cell being read

        $parser = new HtmlParser($Table);
        while ($parser->parse()) {
            $name = strtolower($parser->iNodeName);
            $isEndTag = ($parser->iNodeType == NODE_TYPE_ENDELEMENT);

            if ($name == 'tr' && !$isEndTag) {
                // A new row starts: store the previous one, if it had cells.
                if (count($cells) > 0) {
                    $data[] = $cells;
                }
                $cells = array();
                $cellText = '';
            } elseif ($name == 'td' && $isEndTag) {
                $cells[] = $cellText;
                $cellText = '';
            } elseif ($parser->iNodeName == 'Text' && isset($parser->iNodeValue)) {
                $cellText .= $parser->iNodeValue;
            }
        }

        // Store the last row; anything left in $cellText had no closing </td>.
        if (count($cells) > 0) {
            $data[] = $cells;
        }
        return $data;
    }
}
?>
htmlparser.inc
<?php
/*
* Copyright (c) 2003 Jose Solorzano. All rights reserved.
* Redistribution of source must retain this copyright notice.
*
* Jose Solorzano (http://jexpert.us) is a software consultant.
*
* Contributions by:
* - Leo West (performance improvements)
*/
// Node type codes stored in HtmlParser::$iNodeType after a call to parse().
define ("NODE_TYPE_START",0); // not assigned by the parser code shown in this file
define ("NODE_TYPE_ELEMENT",1); // a start tag such as <td>
define ("NODE_TYPE_ENDELEMENT",2); // an end tag such as </td>
define ("NODE_TYPE_TEXT",3); // character data between tags
define ("NODE_TYPE_COMMENT",4); // an HTML comment <!-- ... -->
define ("NODE_TYPE_DONE",5); // set when parse() returns false at end of input
/**
* Class HtmlParser.
* To use, create an instance of the class passing
* HTML text. Then invoke parse() until it's false.
* When parse() returns true, $iNodeType, $iNodeName
* $iNodeValue and $iNodeAttributes are updated.
*
* To create an HtmlParser instance you may also
* use convenience functions HtmlParser_ForFile
* and HtmlParser_ForURL.
*/
class HtmlParser {

    /**
     * Field iNodeType.
     * May be one of the NODE_TYPE_* constants above.
     */
    var $iNodeType;

    /**
     * Field iNodeName.
     * For elements, it's the name of the element.
     * Text nodes are named "Text" and comments "Comment".
     */
    var $iNodeName = "";

    /**
     * Field iNodeValue.
     * For text nodes, it's the text. For comments, the full comment markup.
     */
    var $iNodeValue = "";

    /**
     * Field iNodeAttributes.
     * A string-indexed array containing attribute values
     * of the current node. Indexes are always lowercase.
     */
    var $iNodeAttributes;

    // The following fields should be
    // considered private:

    var $iHtmlText;            // the complete input text
    var $iHtmlTextLength;      // strlen() of the input text
    var $iHtmlTextIndex = 0;   // offset of the current character
    var $iCurrentChar;         // current character, or int -1 past the end
    var $BOE_ARRAY;            // blanks or "="
    var $B_ARRAY;              // blanks
    var $BOS_ARRAY;            // blanks or "/"

    /**
     * Constructor.
     * Constructs an HtmlParser instance with
     * the HTML text given.
     *
     * Declared as __construct() because a method named after the class
     * is no longer treated as a constructor by PHP 8.
     */
    function __construct ($aHtmlText) {
        $aHtmlText = (string) $aHtmlText;
        $this->iHtmlText = $aHtmlText;
        $this->iHtmlTextLength = strlen($aHtmlText);
        $this->iNodeAttributes = array();
        $this->setTextIndex (0);

        $this->BOE_ARRAY = array (" ", "\t", "\r", "\n", "=" );
        $this->B_ARRAY = array (" ", "\t", "\r", "\n" );
        $this->BOS_ARRAY = array (" ", "\t", "\r", "\n", "/" );
    }

    /**
     * Method parse.
     * Parses the next node. Returns false only if
     * the end of the HTML text has been reached.
     * Updates values of iNode* fields.
     */
    function parse() {
        $text = $this->skipToElement();
        if ($text !== "") {
            $this->iNodeType = NODE_TYPE_TEXT;
            $this->iNodeName = "Text";
            $this->iNodeValue = $text;
            return true;
        }
        return $this->readTag();
    }

    /** Resets the attribute map of the current node. */
    function clearAttributes() {
        $this->iNodeAttributes = array();
    }

    /**
     * Reads the tag that starts at the current "<" and fills the iNode*
     * fields. Produces an end element, a comment, a start element with its
     * attributes, or (for anything that is not a valid tag name) a text node
     * holding "<" plus the characters read so far.
     * Returns false, with iNodeType set to NODE_TYPE_DONE, when the current
     * character is not "<" (parse() only gets here at the end of the input).
     */
    function readTag() {
        if ($this->iCurrentChar !== "<") {
            $this->iNodeType = NODE_TYPE_DONE;
            return false;
        }
        $this->clearAttributes();
        $this->skipMaxInTag ("<", 1);

        if ($this->iCurrentChar === "/") {
            $this->moveNext();
            $name = $this->skipToBlanksInTag();
            $this->iNodeType = NODE_TYPE_ENDELEMENT;
            $this->iNodeName = $name;
            $this->iNodeValue = "";
            $this->skipEndOfTag();
            return true;
        }

        $name = $this->skipToBlanksOrSlashInTag();
        if (!$this->isValidTagIdentifier ($name)) {
            $comment = false;
            if (strpos($name, "!--") === 0) {
                // "<!--xyz-->" with no blank or slash inside: the whole
                // comment was consumed as the tag name and ends in "--".
                // Testing the tail (rather than the first "--" after the
                // opener) also accepts comments such as "<!--a--b-->".
                if (strlen($name) >= 5 && substr($name, -2) === "--") {
                    $this->iNodeType = NODE_TYPE_COMMENT;
                    $this->iNodeName = "Comment";
                    $this->iNodeValue = "<" . $name . ">";
                    $comment = true;
                }
                else {
                    $rest = $this->skipToStringInTag ("-->");
                    if ($rest !== "") {
                        $this->iNodeType = NODE_TYPE_COMMENT;
                        $this->iNodeName = "Comment";
                        $this->iNodeValue = "<" . $name . $rest;
                        // Already skipped end of tag
                        return true;
                    }
                }
            }
            if (!$comment) {
                $this->iNodeType = NODE_TYPE_TEXT;
                $this->iNodeName = "Text";
                $this->iNodeValue = "<" . $name;
                return true;
            }
        }
        else {
            $this->iNodeType = NODE_TYPE_ELEMENT;
            $this->iNodeValue = "";
            $this->iNodeName = $name;
            // Each attribute must be preceded by at least one blank.
            while ($this->skipBlanksInTag()) {
                $attrName = $this->skipToBlanksOrEqualsInTag();
                if ($attrName != "" && $attrName != "/") {
                    $this->skipBlanksInTag();
                    if ($this->iCurrentChar === "=") {
                        $this->skipEqualsInTag();
                        $this->skipBlanksInTag();
                        $value = $this->readValueInTag();
                        $this->iNodeAttributes[strtolower($attrName)] = $value;
                    }
                    else {
                        // Attribute without a value, e.g. "checked".
                        $this->iNodeAttributes[strtolower($attrName)] = "";
                    }
                }
            }
        }
        $this->skipEndOfTag();
        return true;
    }

    /**
     * Tells whether $name consists only of letters, digits, "_" and "-".
     * Uses preg_match() because ereg() was removed in PHP 7.
     */
    function isValidTagIdentifier ($name) {
        return preg_match ('/^[A-Za-z0-9_\-]+$/D', $name) === 1;
    }

    /** Skips blanks; returns true when at least one was skipped. */
    function skipBlanksInTag() {
        return "" != ($this->skipInTag ($this->B_ARRAY));
    }

    function skipToBlanksOrEqualsInTag() {
        return $this->skipToInTag ($this->BOE_ARRAY);
    }

    function skipToBlanksInTag() {
        return $this->skipToInTag ($this->B_ARRAY);
    }

    function skipToBlanksOrSlashInTag() {
        return $this->skipToInTag ($this->BOS_ARRAY);
    }

    function skipEqualsInTag() {
        return $this->skipMaxInTag ("=", 1);
    }

    /**
     * Reads an attribute value: either a single- or double-quoted string
     * (quotes are dropped) or an unquoted run of non-blank characters.
     * In every case reading also stops at ">".
     */
    function readValueInTag() {
        $ch = $this->iCurrentChar;
        if ($ch === "\"" || $ch === "'") {
            $this->skipMaxInTag ($ch, 1);
            $value = $this->skipToInTag ($ch);
            $this->skipMaxInTag ($ch, 1);
            return $value;
        }
        return $this->skipToBlanksInTag();
    }

    /**
     * Moves to offset $index and loads the character found there;
     * iCurrentChar becomes int -1 when $index is past the end.
     */
    function setTextIndex ($index) {
        $this->iHtmlTextIndex = $index;
        if ($index >= $this->iHtmlTextLength) {
            $this->iCurrentChar = -1;
        }
        else {
            // Square brackets: the {$index} offset form was removed in PHP 8.
            $this->iCurrentChar = $this->iHtmlText[$index];
        }
    }

    /** Advances one character; returns false when already at the end. */
    function moveNext() {
        if ($this->iHtmlTextIndex < $this->iHtmlTextLength) {
            $this->setTextIndex ($this->iHtmlTextIndex + 1);
            return true;
        }
        else {
            return false;
        }
    }

    /** Skips up to and including the next ">" (or to the end of the text). */
    function skipEndOfTag() {
        while (($ch = $this->iCurrentChar) !== -1) {
            $this->moveNext();
            if ($ch === ">") {
                return;
            }
        }
    }

    /**
     * Internal helper: tells whether the character $ch is one of $chars.
     * $chars may be an array of characters or a string of characters;
     * callers in this class pass both forms (count() on a string, which
     * the earlier loops relied on, is a TypeError in PHP 8).
     */
    function matchesAny ($ch, $chars) {
        if (is_array($chars)) {
            return in_array ($ch, $chars, true);
        }
        $chars = (string) $chars;
        return $chars !== "" && strpos ($chars, $ch) !== false;
    }

    /**
     * Consumes characters while they are in $chars, stopping at ">".
     * Returns the consumed characters.
     */
    function skipInTag ($chars) {
        $sb = "";
        while (($ch = $this->iCurrentChar) !== -1) {
            if ($ch === ">" || !$this->matchesAny ($ch, $chars)) {
                return $sb;
            }
            $sb .= $ch;
            $this->moveNext();
        }
        return $sb;
    }

    /** Same as skipInTag(), but consumes at most $maxChars characters. */
    function skipMaxInTag ($chars, $maxChars) {
        $sb = "";
        $count = 0;
        while (($ch = $this->iCurrentChar) !== -1 && $count++ < $maxChars) {
            if ($ch === ">" || !$this->matchesAny ($ch, $chars)) {
                return $sb;
            }
            $sb .= $ch;
            $this->moveNext();
        }
        return $sb;
    }

    /**
     * Consumes characters until one of $chars or ">" is found.
     * Returns the consumed characters; the stop character is not consumed.
     */
    function skipToInTag ($chars) {
        $sb = "";
        while (($ch = $this->iCurrentChar) !== -1) {
            if ($ch === ">" || $this->matchesAny ($ch, $chars)) {
                return $sb;
            }
            $sb .= $ch;
            $this->moveNext();
        }
        return $sb;
    }

    /** Consumes and returns the text up to (not including) the next "<". */
    function skipToElement() {
        $sb = "";
        while (($ch = $this->iCurrentChar) !== -1) {
            if ($ch === "<") {
                return $sb;
            }
            $sb .= $ch;
            $this->moveNext();
        }
        return $sb;
    }

    /**
     * Returns text between current position and $needle,
     * inclusive, or "" if not found. The current index is moved to a point
     * after the location of $needle, or not moved at all
     * if nothing is found.
     */
    function skipToStringInTag ($needle) {
        $pos = strpos ($this->iHtmlText, $needle, $this->iHtmlTextIndex);
        if ($pos === false) {
            return "";
        }
        $top = $pos + strlen($needle);
        $retvalue = substr ($this->iHtmlText, $this->iHtmlTextIndex, $top - $this->iHtmlTextIndex);
        $this->setTextIndex ($top);
        return $retvalue;
    }
}
/**
 * Convenience factory: builds an HtmlParser for the contents of a file.
 * The file name is handed unchanged to HtmlParser_ForURL(), which reads
 * it through fopen().
 */
function HtmlParser_ForFile ($fileName) {
    $parser = HtmlParser_ForURL ($fileName);
    return $parser;
}
/**
 * Convenience factory: reads everything fopen() can deliver for $url
 * (a URL or a file path) and returns an HtmlParser over that content.
 *
 * When the resource cannot be opened, fopen() raises its usual warning and
 * a parser over an empty document is returned, so callers always receive a
 * usable HtmlParser instance.
 */
function HtmlParser_ForURL ($url) {
    $content = "";
    $fp = fopen ($url, "r");
    if ($fp === false) {
        // Do not pass false on to fread()/fclose(); PHP 8 rejects that
        // with a TypeError.
        return new HtmlParser ($content);
    }
    while (true) {
        $data = fread ($fp, 8192);
        // false signals a read error, "" the end of the stream.
        if ($data === false || strlen($data) == 0) {
            break;
        }
        $content .= $data;
    }
    fclose ($fp);
    return new HtmlParser ($content);
}
?>
index.php
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title>HTML <Table> To Database</title>
<script language="javascript">
/**
 * Fills the "htmltable" field of the given form with a sample HTML table
 * (one header row and four data rows, separated by "\r").
 *
 * The literal is assembled from one piece per table row; a JavaScript
 * string literal may not run across physical lines, and breaking it inside
 * a tag name would corrupt the sample markup.
 *
 * @param form the form that owns the "htmltable" textarea
 */
function insertsample(form)
{
    form.htmltable.value =
        "<TABLE>\r" +
        "<TR><TD>S.No.</TD><TD>Name</TD><TD>Age</TD><TD>Sex</TD><TD>Location</TD></TR>\r" +
        "<TR><TD>1</TD><TD>Azeem</TD><TD>24</TD><TD>Male</TD><TD>Pakistan</TD></TR>\r" +
        "<TR><TD>2</TD><TD>Khurram</TD><TD>24</TD><TD>Male</TD><TD>Pakistan</TD></TR>\r" +
        "<TR><TD>3</TD><TD>Mushhad</TD><TD>24</TD><TD>Male</TD><TD>Pakistan</TD></TR>\r" +
        "<TR><TD>3</TD><TD>Qamar</TD><TD>24</TD><TD>Male</TD><TD>Pakistan</TD></TR>\r" +
        "</TABLE>";
}
/**
 * Submit handler: refuses to submit while the "htmltable" field is empty.
 *
 * @param form the form to check. The page calls this with "this.form" from
 *             the form's own onSubmit handler, which is undefined there, so
 *             the form is looked up by name when the argument is unusable.
 * @return false (after alerting the user) when the field is empty,
 *         true otherwise
 */
function validate(form)
{
    var target = (form && form.htmltable) ? form : document.forms['frmhtmltabledb'];
    if (target.htmltable.value == '') {
        alert('Please enter a HTML Table or Click "Fill Sample Text"');
        return false;
    }
    return true;
}
</script>
</head>
<body>
<form name="frmhtmltabledb" action="mapcolumns.php" method="post" onSubmit="return validate(this.form);">
<table cellpadding="3" cellspacing="0" align="center" width="75%" bgcolor="#CCCCCC">
<tr>
<td colspan="2">HTML &lt;Table&gt; Here</td>
</tr>
<tr>
<td><textarea cols="75" rows="25" name="htmltable"></textarea></td>
<td valign="top"><input type="submit" value="Next >>"><br><input type="button" value="Fill Sample Text"
onClick="javascript:insertsample(this.form);"></td>
</tr>
</table>
</form>
</body>
</html>
mapcolumns.php
<?php
require('htmltabletodb.class.php');
?>
<html>
<head>
<title>HTML <Table> To Database</title>
<script language="javascript">
/**
 * Click handler of an "Insert" checkbox: locks and clears, or unlocks,
 * the column-name text field that sits directly before the checkbox in
 * form.elements (the checkbox id is its own index in that list).
 *
 * The field is made read-only rather than disabled: a disabled control is
 * left out of the submitted form data, which would shift the positions of
 * the remaining "column[]" values and pair the names with the wrong table
 * cells on the next page. A read-only field is still submitted, as an
 * empty string, and an empty name is how a column is excluded.
 *
 * @param form     the form that owns the controls
 * @param checkbox the checkbox that was clicked
 */
function includeColumn(form,checkbox)
{
    var field = form.elements[parseInt(checkbox.id, 10) - 1];
    if (checkbox.checked == false) {
        field.readOnly = true;
        field.value = '';
    } else {
        field.readOnly = false;
        field.disabled = false;
    }
}
/**
 * Fills the form with sample values: the table name "user_info" and up to
 * five sample column names.
 *
 * The column-name fields are the "column[]" inputs found at the odd
 * positions 1, 3, 5, ... of form.elements (each is followed by its
 * checkbox). Filling stops at the first position that does not hold such a
 * field, so tables with fewer than five columns do not get the values of
 * unrelated controls overwritten. Fields that were locked through their
 * checkbox are left empty.
 *
 * @param form the form that owns the "dbTable" and "column[]" fields
 */
function insertsample(form)
{
    var sampleNames = ["user_id", "user_name", "user_age", "user_sex", "user_location"];
    var n, field;

    form.dbTable.value='user_info';
    for (n = 0; n < sampleNames.length; n++) {
        field = form.elements[2 * n + 1];
        if (!field || field.name !== 'column[]') {
            break;
        }
        if (field.disabled || field.readOnly) {
            continue;
        }
        field.value = sampleNames[n];
    }
}
/**
 * Submit handler of the column-mapping form: requires a table name and at
 * least one non-empty column-name field.
 *
 * The column-name fields are the elements at the odd positions 1, 3, 5, ...
 * of the form; the last four elements of the form are not column fields
 * and are excluded from the scan.
 *
 * @return true when the form may be submitted; false after alerting the
 *         user and moving the focus to the field that needs input
 */
function validate()
{
    var form = document.forms['frmhtmltabletodb'];
    var i, isFilled = false;

    for (i = 1; i < (form.elements.length - 4); i += 2)
    {
        if (form.elements[i].value != '') {
            isFilled = true;
            break;
        }
    }
    if (form.dbTable.value == '') {
        alert('Please Enter Table Name');
        form.dbTable.focus();
        return false;
    } else if (isFilled == false) {
        alert('Please Enter atleast one Database Table Field');
        // Guarded: a form without any column row has no element 1.
        if (form.elements[1] && form.elements[1].focus) {
            form.elements[1].focus();
        }
        return false;
    }
    return true;
}
</script>
</head>
<body>
<?php
// Reads the posted table markup and extracts the cells of its first row;
// those cells are listed below as the columns that can be mapped.
$objClass = new htmlTabletoDb();
$html = isset($_POST["htmltable"]) ? (string) $_POST["htmltable"] : '';
$totalColumns = 0;

// Keep everything up to and including the first "</tr>" and close the table.
// When the markup has no "</tr>" at all it is parsed as it is.
$start = strpos(strtolower($html),'</tr');
if ($start === false) {
    $columns = $html;
} else {
    $columns = substr($html,0,$start+5)."</table>";
}

// The fragment is only fed to the parser, which ignores tag attributes, so
// no styling is injected into it and the cell text keeps its original case.
$arr_columns = $objClass->ParseTable($columns);
if (!is_array($arr_columns)) {
    $arr_columns = array();
}
?>
<form name="frmhtmltabletodb" action="parse.php" method="post" onSubmit="return validate();">
<table cellpadding="0" cellspacing="0" width="75%" align="center" bgcolor="#CCCCCC">
<tr>
<td>Database Table Name</td>
</tr>
<tr>
<td><input type="text" name="dbTable" size="40"></td>
</tr>
</table><br>
<?php
// Renders, for each parsed row, a table with one line per column: running
// number, column text from the HTML table, a text field for the database
// column name and an "Insert" checkbox. The checkbox id is its own index
// in form.elements (dbTable is element 0, then text/checkbox pairs), which
// includeColumn() relies on.
if (!isset($arr_columns) || !is_array($arr_columns)) {
    $arr_columns = array();
}
foreach($arr_columns as $key => $value)
{
    echo "<table cellspacing=0 width='75%' align='center' bgcolor=\"#CCCCCC\"><tr><td colspan='3'>Column(s) Name</td></tr>"
        ."<tr><td width=\"80%\"><table cellspacing=0 cellpadding='3' width='100%' align='center'>"
        ."<tr Style='background-color:#999999;'><td>#</td><td>HTML Table Column Name</td><td>Database Table Column Name</td><td>Insert</td></tr>";
    foreach($value as $innerkey => $innervalue)
    {
        // The cell text comes from user-supplied markup: escape it for HTML.
        $label = htmlspecialchars((string) $innervalue, ENT_QUOTES);
        echo "<tr><td width=\"5%\" Style=\"border:1px solid #000;\">".($innerkey+1)."</td>"
            ."<td Style=\"border:1px solid #000;\" >".$label."</td>"
            ."<td Style=\"border:1px solid #000;\" align=\"center\" width=\"20%\"><input type=\"text\" Style=\"border:1px solid #000;\" size=20 name=\"column[]\"></td>"
            ."<td Style=\"border:1px solid #000;\" align=\"center\" width=\"10%\"><input type=\"checkbox\" id=\"".($totalColumns+2)."\" name=\"chk".$totalColumns."\" onClick=\"javascript:includeColumn(this.form,this);\" CHECKED></td></tr>";
        $totalColumns+=2;
    }
    echo "</table></td><td valign=\"top\" width=\"20%\"><input type=\"submit\" value=\"Next >>\"><br>"
        ."<input type=\"button\" value=\"Fill Sample Columns\" onClick=\"javascript:insertsample(this.form);\"><br><br>"
        ."<input type=\"hidden\" name=\"execute\"></td></tr></table>";
}
?>
<BR>
<table cellpadding="0" cellspacing="0" width="75%" align="center" bgcolor="#CCCCCC">
<tr>
<td>Original Data</td>
</tr>
<tr>
<?php /* The posted markup is escaped so it cannot close the textarea and inject HTML; the browser decodes it again on submit. */ ?>
<td align="center"><textarea cols="80" rows="25" name="htmltable"><?=htmlspecialchars(isset($html) ? (string) $html : '', ENT_QUOTES)?></textarea></td>
</tr>
</table>
</form>
</body>
</html>
parse.php
<?php
require('htmltabletodb.class.php');
?>
<html>
<head>
<title>HTML <Table> To Database</title>
<script language="javascript">
/**
 * Gives the focus to a form field and selects its whole content.
 *
 * @param theField dotted path of the field below "document",
 *                 e.g. 'test.select2' for document.test.select2
 *
 * The path is resolved property by property instead of being passed to
 * eval(); nothing happens when the path does not lead to an object.
 */
function selectAll(theField) {
    var parts = String(theField).split('.');
    var target = document;
    var i;
    for (i = 0; i < parts.length && target; i++) {
        target = target[parts[i]];
    }
    if (!target) {
        return;
    }
    target.focus();
    target.select();
}
/**
 * Copies a text to the system clipboard.
 *
 * Three mechanisms are tried in this order:
 *  1. the asynchronous Clipboard API (navigator.clipboard.writeText);
 *  2. window.clipboardData (Internet Explorer);
 *  3. the XPCOM clipboard interface of old Mozilla/Netscape browsers,
 *     which requires signed code or a lowered security setting.
 *
 * @param meintext the text to copy
 * @return always false, so the function can be used directly as the
 *         return value of an onClick handler
 */
function copy_clip(meintext)
{
    if (typeof navigator !== 'undefined' && navigator.clipboard &&
        typeof navigator.clipboard.writeText === 'function')
    {
        // A refused write is ignored, like the unsupported-browser case
        // further below, which also does nothing.
        navigator.clipboard.writeText(meintext).then(null, function () {});
        return false;
    }

    if (window.clipboardData)
    {
        // the IE way
        window.clipboardData.setData("Text", meintext);
    }
    else if (window.netscape)
    {
        // probably not the best way to detect Mozilla/Netscape; it is
        // unknown from which version on this works exactly.
        // Important: you have to sign the code to enable this.
        netscape.security.PrivilegeManager.enablePrivilege('UniversalXPConnect');

        // create an interface to the clipboard
        var clip = Components.classes['@mozilla.org/widget/clipboard;1'].createInstance(Components.interfaces.nsIClipboard);
        if (!clip) return false;

        // create a transferable
        var trans = Components.classes['@mozilla.org/widget/transferable;1'].createInstance(Components.interfaces.nsITransferable);
        if (!trans) return false;

        // specify the kind of data being transferred; text in this case
        trans.addDataFlavor('text/unicode');

        // the text travels inside an nsISupportsString object
        var str = Components.classes["@mozilla.org/supports-string;1"].createInstance(Components.interfaces.nsISupportsString);
        str.data = meintext;
        trans.setTransferData("text/unicode", str, meintext.length * 2);

        clip.setData(trans, null, Components.interfaces.nsIClipboard.kGlobalClipboard);
    }
    return false;
}
</script>
</head>
<body>
<?php
// Builds one INSERT statement per parsed table row into $sql.
//
// Input (all posted by the column-mapping form):
//   htmltable - the HTML table markup
//   dbTable   - the database table name
//   column[]  - one database column name per table column, in table order;
//               an empty name excludes that table column
//
// The table and column names are written into the statements exactly as
// entered; the statements are only displayed on this page, not executed.
$objClass = new htmlTabletoDb();
$html = isset($_POST["htmltable"]) ? (string) $_POST["htmltable"] : '';
$tableName = isset($_POST["dbTable"]) ? (string) $_POST["dbTable"] : '';
$arr_columnsName = (isset($_POST["column"]) && is_array($_POST["column"])) ? $_POST["column"] : array();
$sql = '';

// Comma-separated list of the column names that were filled in.
$usedNames = array();
foreach($arr_columnsName as $cValue)
{
    if($cValue!='')
        $usedNames[] = $cValue;
}
$columnsName = implode(",", $usedNames);

$arr_data = $objClass->ParseTable($html);
if (!is_array($arr_data)) {
    $arr_data = array();
}
foreach($arr_data as $outerValue)
{
    $rowValues = array();
    foreach($outerValue as $innerKey => $innerValue)
    {
        // Only cells whose position has a column name are inserted; rows
        // may hold more cells than there are posted names.
        if(isset($arr_columnsName[$innerKey]) && $arr_columnsName[$innerKey]!=''){
            // Single quotes are doubled so a value such as O'Brien cannot
            // end the SQL string literal early.
            $rowValues[] = "'".str_replace("'", "''", trim($innerValue))."'";
        }
    }
    $sql .= "\n\rINSERT INTO ".$tableName."(".$columnsName.") \nVALUES(".implode(",", $rowValues).");";
}
?>
<form name="test">
<table cellpadding="0" cellspacing="0" width="75%" align="center" >
<tr>
<td>SQL Queries</td>
<td align="right"><a href="index.php">Home</a> :: <a
href="javascript:selectAll('test.select2')">Select All</a> :: <input type="button" value="Copy"
onClick="return copy_clip(test.select2.value);"></td>
</tr>
<tr>
<?php /* The generated SQL contains user-supplied text: escape it so it cannot close the textarea and inject HTML. */ ?>
<td colspan="2"><textarea cols="100" rows="25" name="select2" ><?=htmlspecialchars(isset($sql) ? (string) $sql : '', ENT_QUOTES)?></textarea></td>
</tr>
</table>
</form>
</body>
</html>