function ExtractHtmlTagValues(const HtmlText: string; TagName, AttribName: string; var Values: TStringList): integer;
function FindFirstCharAfterSpace(const Line: string; StartPos: integer): Integer; var i: integer; begin Result := -1; for i := StartPos to Length(Line) do begin if (Line[i] <> ' ') then begin Result := i; exit; end; end; end;
function FindFirstSpaceAfterChars(const Line: string; StartPos: integer): Integer; begin Result := PosEx(' ', Line, StartPos); end;
function FindFirstSpaceBeforeChars(const Line: string; StartPos: integer): Integer; var i: integer; begin Result := 1; for i := StartPos downto 1 do begin if (Line[i] = ' ') then begin Result := i; exit; end; end; end;
var InnerTag: string; LastPos, LastInnerPos: Integer; SPos, LPos, RPos: Integer; AttribValue: string; ClosingChar: char; TempAttribName: string; begin Result := 0; LastPos := 1; while (true) do begin // find outer tags '<' & '>' LPos := PosEx('<', HtmlText, LastPos); if (LPos <= 0) then break; RPos := PosEx('>', HtmlText, LPos+1); if (RPos <= 0) then LastPos := LPos + 1 else LastPos := RPos + 1;
// get inner tag InnerTag := Copy(HtmlText, LPos+1, RPos-LPos-1); InnerTag := Trim(InnerTag); // remove spaces if (Length(InnerTag) < Length(TagName)) then continue;
// check tag name if (SameText(Copy(InnerTag, 1, Length(TagName)), TagName)) then begin // found tag AttribValue := ''; LastInnerPos := Length(TagName)+1; while (LastInnerPos < Length(InnerTag)) do begin // find first '=' after LastInnerPos RPos := PosEx('=', InnerTag, LastInnerPos); if (RPos <= 0) then break;
// this way you can check for multiple attrib names and not a specific attrib SPos := FindFirstSpaceBeforeChars(InnerTag, RPos); TempAttribName := Trim(Copy(InnerTag, SPos, RPos-SPos)); if (true) then begin // found correct tag LPos := FindFirstCharAfterSpace(InnerTag, RPos+1); if (LPos <= 0) then begin LastInnerPos := RPos + 1; continue; end; LPos := FindFirstCharAfterSpace(InnerTag, LPos); // get to first char after '=' if (LPos <= 0) then continue; if ((InnerTag[LPos] <> '"') and (InnerTag[LPos] <> '''')) then begin // AttribValue is not between '"' or ''' so get it RPos := FindFirstSpaceAfterChars(InnerTag, LPos+1); if (RPos <= 0) then AttribValue := Copy(InnerTag, LPos, Length(InnerTag)-LPos+1) else AttribValue := Copy(InnerTag, LPos, RPos-LPos+1); end else begin // get url between '"' or ''' ClosingChar := InnerTag[LPos]; RPos := PosEx(ClosingChar, InnerTag, LPos+1); if (RPos <= 0) then AttribValue := Copy(InnerTag, LPos+1, Length(InnerTag)-LPos-1) else AttribValue := Copy(InnerTag, LPos+1, RPos-LPos-1) end;
if (SameText(TempAttribName, AttribName)) and (AttribValue <> '') then begin Values.Add(AttribValue); inc(Result); end; end;
if (RPos <= 0) then LastInnerPos := Length(InnerTag) else LastInnerPos := RPos+1; end; end; end; end;
用法示例: 取得页面中所有链接 var Links : TStringList; LinkFound,i : Integer; begin Links := TStringList.Create; LinkFound := ExtractHtmlTagValues(HtmlText,'A','HREF',Links); for i:=0 to LinkFound-1 do begin //Add your own codes here end; Links.Free; end;
2)表单元素值攫取函数,可以从HTML文本中按照给定的Input名称解析出其Value
function GetValByName(S, Sub: string) : string; var EleS,EleE,iPos: Integer; ELeStr,ValSt: String; St,Ct : Integer; function FindEleRange(str: string ; front : boolean; posi : integer): Integer; var i: integer; begin if Front then begin for i:=posi-1 downto 1 do if Str[i]='<' then begin Result := i; break; end; end else begin for i := posi+1 to length(Str) do if Str[i]='>' then begin Result := i; break; end; end; end; function FindEnd (str : string; posi : integer) : Integer; var i: integer; begin for i:=posi to length(str) do begin if (str[i] ='"') or (str[i] ='''') or (str[i] =' ') then begin result := i-1; break; end; end; end; begin iPos := Pos('name="'+lowercase(Sub)+'"',lowercase(S)); if iPos = 0 then iPos := Pos('name='+lowercase(Sub),lowercase(S)); if iPos = 0 then iPos := Pos('name='''+lowercase(Sub)+'''',lowercase(S)); if iPos = 0 then exit; EleS := FindEleRange(S,TRUE,iPos); EleE := FindEleRange(S,FALSE,iPos); EleStr := Copy(S,EleS,EleE-EleS+1); ValSt := 'value="'; iPos := Pos(ValSt,EleStr); if iPos = 0 then begin ValSt := 'value='''; iPos := Pos(ValSt,EleStr); end; if iPos = 0 then begin ValSt := 'value='; iPos := Pos(ValSt,EleStr); end; St := iPos+length(ValSt); Ct := FindEnd(EleStr,St)-St+1; Result := Copy(EleStr,St,Ct); end;
用法示例: 取得页面中名为 Submit 的表单项的值 var InputValue : String; begin InputValue := GetValByName(HtmlText,'Submit'); end;
3)取某两个字符串中间的字符
function getStrFromHtml(var Source: String; SbStr, bStr, eStr: String): String; var I: Integer; sbPos, bPos, ePos: Integer; S: String; begin S := Source;
Result := '' ; if SBStr <> '' then Begin sbPos := Pos(UpperCase(SbStr), UpperCase(S)); if sbPos > 0 then Delete(S, 1, sbPos - 1 + length(sbStr)) Else Exit; End;
bPos := Pos(UpperCase(bStr), UpperCase(S)); if bPos > 0 then Delete(S, 1, bPos - 1 + length(bStr)) Else Exit;
ePos := pos(UpperCase(eStr), UpperCase(S)); if ePos > 0 then Delete(S, ePos, length(S));