// Site scanner script (forum-paste artifact "Code: Select all" converted to a comment).
// --- Setup: prompt for a start URL and prepare the scanner ---
// Use Perl-compatible regular expression syntax for RegEx objects below.
PerlRegEx = Yes;
Output.Clear;
// Ask the user for the site to scan; 'http://' is the pre-filled default text.
WebSite = Input('Site Scanner', 'Enter a URL to scan...', 'http://');
if WebSite = nothing then Terminate;
Abort = No; // global stop flag: raised by OnStop, read by CheckLink
// Capture the host of the entered URL; BaseHost restricts the crawl to the
// same site in CheckLink. NOTE(review): HostName is not used at top level --
// presumably DecodeURL requires both out-parameters; confirm.
DecodeURL(WebSite, [BaseHost], [HostName]);
Scan = New(Scanner);
// Link-validation callback (assigned to Scan.OnValidateLink below).
// Returns No once a stop has been requested, or when the candidate link's
// host differs from the start URL's host; otherwise Yes, allowing the crawl.
function CheckLink();
begin
// Reject every further link once the user has asked to stop the scan.
if @Abort then begin
Result = No;
Exit;
end;
// Compare the candidate link's host against the host captured at startup.
DecodeURL(@Scan.Location, [Host], [HostName]);
Result = (Host = @BaseHost);
if Result then begin
// Filter out URL here...
// (Uncomment and adapt to exclude specific URL patterns, e.g. sort orders.)
//if @Scan.Location.Pos('&order=') > 0 then Result = False;
end;
end;
// Pre-download callback (assigned to Scan.OnBeforeDownload below).
// Only lets HTML resources through. NOTE(review): '~=' looks like a
// starts-with/like comparison on the MIME type (so 'text/html; charset=...'
// would also match) -- confirm against the scripting-language reference.
function CheckMime();
begin
Result = (@Scan.MimeType ~= 'text/html');
end;
// Post-download callback (assigned to Scan.OnAfterDownload below).
// For HTML pages: extracts title / meta description / meta keywords,
// emits one output line, and queues every <a href> link for crawling.
function CheckData();
begin
rx = New(RegEx);
// Re-check the MIME type; only HTML content is parsed.
if @Scan.MimeType ~= 'text/html' then begin
// Begin extract page info...
// NOTE(review): mask lacks the closing '>' of '</title>' and uses a greedy
// '.*' -- fine for a single well-formed <title>, but verify WildGet semantics.
PageTitle = Trim(Decode(WildGet(@Scan.Data, '<title>(.*)</title')));
rx.Data = @Scan.Data;
// Groups 1/2 match the quote characters around the attribute values
// (backreferences \1/\2 require the same quote to close); group 3 is the
// description text itself.
rx.Mask = '<meta\s+name\s*=\s*("|'')description\1\s*content=("|'')([^\2]*?)\2';
rx.Reset;
if rx.Match then PageDesc = Trim(Decode(rx.Value[3])) else PageDesc = Nothing;
// Same pattern shape for the keywords meta tag.
rx.Mask = '<meta\s+name\s*=\s*("|'')keywords\1\s*content=("|'')([^\2]*?)\2';
rx.Reset;
if rx.Match then PageKeyword = Trim(Decode(rx.Value[3])) else PageKeyword = Nothing;
// End extract page info.
// Output info as tab delimited fields...
// (Disabled alternative output with all extracted fields; only the URL is
// emitted below.)
/*Output(
@Scan.Location +tab+
PageTitle +tab+
PageDesc +tab+
PageKeyword
);*/
Output(@Scan.Location);
// Find links within page to scan...
// Queue the href of every <a> tag; CheckLink later decides whether each
// queued link is actually followed.
Page = New(Parser);
Page.Parse(@Scan.Data);
HrefTags = Page.Tags('a','href');
for each HrefTags as Tag do
@Scan.AddLink(Tag);
end;
end;
// --- Wire the callbacks to the scanner and start the crawl ---
Scan.Location = WebSite;
Scan.OnValidateLink = CheckLink;
Scan.OnBeforeDownload = CheckMime;
Scan.OnAfterDownload = CheckData;
// NOTE(review): OnStop (defined below) is never assigned here -- presumably
// it is hooked by name/convention as the environment's stop handler; confirm.
Scan.Start;
// Stop handler. On the first stop request it returns False (presumably
// "do not terminate immediately" -- TODO confirm Result semantics) and raises
// the global Abort flag so CheckLink rejects all further links, letting the
// scan wind down gracefully; a second request falls through unchanged.
function OnStop();
begin
if not @Abort then begin
Result = False;
@Abort = True;
end;
end;