Keyword Database

August 27, 2003 by Jerry T

For the corporate websites I've developed, we usually have a small Access database set up with keywords and links to the different pages containing those keywords. The visitor enters a keyword and the detail page displays the closest matches.

The problem is maintaining the database when products or page content are updated. It is a simple way to do it, however.

We recently redesigned our search feature; it now uses a C# search class with no database and seems to be working OK. I'm including the "black and white" version with no fancy formatting. Hope it helps!

Contents of Search.aspx:

<%@ Page Language="C#" AutoEventWireUp="false"
            Inherits="MultiThreadedWebApp.SearchEngine" Src="search.aspx.cs" %>
<script language="c#" runat="server">
             protected void search(Object sender, EventArgs e) {
                        If ( SearchWebSites( keyword.Text, urls.Text ) ) {
                                    info.Text = "Searched <font color=\"red\">" + SearchResults.Count + "</font> web page(s) ";
                                    info.Text += "on the keyword <font color=\"red\">\"" + keyword.Text + "</font>\". ";
                                    info.Text += "Total search time was <font color=\"red\">" + timeSpent + "</font>";
                                    SearchForm.Visible = false;
                                    ResultList.DataSource = SearchResults;
                                    ResultList.DataBind();
                        }
            }
 
</script>
<html>
<head>
            <title>Search Engine</title>
            <style>.BodyText { font-family: verdana; font-size: 12px; color: #333333; } </style>
</head>
<body>
<%-- Status line: shows the prompt at first, then the search summary written by search(). --%>
<asp:label id="info" CssClass="BodyText"
            Text="URL of the web sites to search, one url per line."
            runat="server" /><br />
<%-- One table row per searched page; bound to SearchResults by search(). --%>
<asp:Repeater id="ResultList" runat="server">
    <HeaderTemplate>
            <table Class="BodyText" border="0" cellpadding="3" cellspacing="3">
                        <tr>
                                    <td><b>Found</b></td>
                                    <td><b>Web Page Title</b></td>
                                    <td><b>Web Page URL</b></td>
                                    <td><b>Searched Time</b></td>
                        </tr>
    </HeaderTemplate>
    <ItemTemplate>
                        <tr>
                                    <td><%# DataBinder.Eval(Container.DataItem, "instanceCount") %></td>
                                    <td><%# DataBinder.Eval(Container.DataItem, "pageTitle") %></td>
                                    <td><a href="<%# DataBinder.Eval(Container.DataItem, "pageURL") %>"><%# DataBinder.Eval(Container.DataItem, "pageURL") %></a></td>
                                    <td><%# DataBinder.Eval(Container.DataItem, "timeSpent") %></td>
                        </tr>
    </ItemTemplate>
    <FooterTemplate>
            </table>
    </FooterTemplate>
</asp:Repeater>
<form id="SearchForm" runat = "server" >
<table Class="BodyText">
            <tr>
                        <td>keyword:</td>
<td><asp:textbox CssClass="BodyText" Text="news" id="keyword" runat="server" /></td>
</tr>
<tr>
<td valign="top"></td>
<%-- Change the Text value below to your own URL(s), one per line. --%>
<td><asp:textbox CssClass="BodyText" id="urls" rows="10" columns="30" textmode="MultiLine"
runat="server" Visible="false" Text="http://localhost"/></td>
</tr>
<tr><td align="right" colspan="2">
<asp:button CssClass="BodyText" Text="search!" onclick="search" runat="server" />
</td></tr>
</table>
</form>
</body>
</html>

Contents of Search.aspx.cs

using System;
using System.IO;
using System.Net;
using System.Web;
using System.Web.UI;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Threading;
/// <summary>
/// The namespace that contains the two classes for the search engine.
/// </summary>
namespace MultiThreadedWebApp {
 
            /// <summary>
            /// The class that inherits the System.Web.UI.Page class.
            /// It provides methods and properties for the ASPX page.
            /// </summary>
            Public Class SearchEngine : Page {
 
                        // Private member fields.
                        Private ArrayList _pages;
                        Private TimeSpan _timeSpent;
 
                        /// <summary>
                        /// Returns an ArrayList of WebPage objects,
                        /// which contains the search results information.
                        /// </summary>
                        Public ArrayList SearchResults {
                                    Get { return _pages; }
                        }
 
                        /// <summary>
                        /// A TimeSpan object. It lets us know how Long was the entire search.
                        /// </summary>
                        Public TimeSpan timeSpent {
                                    Get { return _timeSpent; }
                        }
 
                        /// <summary>
                        /// Start searching the web sites.
                        /// </summary>
                        /// <param name="keyword">The keyword To search for.</param>
                        /// <param name="pURLs">List of URLs, seperated by the \n character.</param>
                        /// <returns></returns>
                        Public bool SearchWebSites(String keyword, String pURLs) {
 

                                    // start the timer
                                    DateTime lStarted = DateTime.Now;
                                    _pages = New ArrayList();
                                   
                                    // Split the urls String To an Array
                                    string[] lURLs = pURLs.Split('\n');
                                    Int lIdx;
                                    WebPage wp;
 
                                    // create the Thread Array
                                    Thread[] t = New Thread[ lURLs.Length ];
 
                                    For ( lIdx = 0; lIdx < lURLs.Length; lIdx ++ ) {
                                                // create a WebPage object For Each url
                                                wp = New WebPage(keyword, lURLs[lIdx]);
                                                // add it To the _pages ArrayList
                                                _pages.Add( wp );
                                                // pass the search() method of the New WebPage object
                                                // To the ThreadStart object. Then pass the ThreadStart
                                                // object To the Thread object.
                                                t[lIdx] = New Thread( New ThreadStart( wp.search ) );
                                                // start the Thread object, which executes the search().
                                                t[lIdx].Start();
                                    }
 
                                    For ( lIdx = 0; lIdx < _pages.Count; lIdx ++ ) {
                                                // waiting For all the Threads To finish.
                                                t[lIdx].Join();
                                    }
 
                                    // stop the timer.
                                    _timeSpent = DateTime.Now.Subtract( lStarted );
                                    return true;
                        }
            }
 
            /// <summary>
            /// The class that contains information for each searched web page.
            /// </summary>
            Public Class WebPage {
 
                        // Private member fields.
                        Private Int _instanceCount;
                        Private String _pageURL;
                        Private String _pageTitle;
                        Private String _keyword;
                        Private TimeSpan _timeSpent;
 
                        /// <summary>
                        /// A TimeSpan object. It lets us know how Long was the page search.
                        /// </summary>
                        Public TimeSpan timeSpent {
                                    Get { return _timeSpent; }
                        }
 
                        /// <summary>
                        /// How many times the search keyword appears On the page.
                        /// </summary>
                        Public Int instanceCount {
                                    Get { return _instanceCount; }
                        }
 
                        /// <summary>
                        /// The URL of the search page
                        /// </summary>
                        Public String pageURL {
                                    Get { return _pageURL; }
                        }
                        /// <summary>
/// The title of the search page
/// </summary>
Public String pageTitle {
Get { return _pageTitle; }
}
Public WebPage() {}
/// <summary>
/// A parameterized constructor of the WebPage class.
/// </summary>
/// <param name="keyword">The keyword To search for.</param>
/// <param name="pageURL">The URL To connect to.</param>
Public WebPage(String keyword, String pageURL) {
_keyword = keyword;
_pageURL = pageURL;
}
/// <summary>
/// This method connects To the searching page, And retrieve the page content.
/// It Then passes the content To various Private methods To perform other operations.
/// </summary>
Public void search() {
// start timing it
DateTime lStarted = DateTime.Now;
// create the WebRequest
WebRequest webreq = WebRequest.Create( _pageURL );
// connect To the page, And Get its response
WebResponse webresp = webreq.GetResponse();
// wrap the response stream To a stream reader
StreamReader sr = New StreamReader( webresp.GetResponseStream(), Encoding.ASCII );
StringBuilder sb = New StringBuilder();
String line;
While ( ( line = sr.ReadLine()) != Null ) {
// append Each line the server sends, To the String builder
sb.Append(line);
}
sr.Close();
String pageCode = sb.ToString();

// Get the page title
_pageTitle = getPageTitle( pageCode );
// Get the amount of time the keyword appeared On the page
_instanceCount = countInstance( getPureContent( pageCode ) );
// stop the timer
_timeSpent = DateTime.Now.Subtract( lStarted );
}
// this method uses the regular expression To match the keyword.
// it Then count the matches To find out how many times the keyword appeared On the page.
Private Int countInstance(String str) {

String lPattern = "(" + _keyword + ")";
Int count = 0;
Regex rx = New Regex(lPattern, RegexOptions.IgnoreCase | RegexOptions.Compiled );
StringBuilder sb = New StringBuilder();
Match mt;
For ( mt = rx.Match(str); mt.Success; mt = mt.NextMatch() )
count ++ ;
return count;
}
// this method uses the regular expression To match the pattern that represent all
// String enclosed between ">" And "<". It removes all the HTML tags,
// And only returns the HTML decoded content string.
Private String getPureContent(String str) {
String lPattern = ">(?:(?<c>[^<]+))";
Regex rx = New Regex(lPattern, RegexOptions.IgnoreCase | RegexOptions.Compiled );
StringBuilder sb = New StringBuilder();
Match mt;
For ( mt = rx.Match(str); mt.Success; mt = mt.NextMatch() ) {
sb.Append( HttpUtility.HtmlDecode( mt.Groups["c"].ToString() ) );
sb.Append( " " );
}
return sb.ToString();
}
// this method uses the regular expression To match the pattern that represent the
// HTML Title tag of the page. It only returns the first match, And ignores the rest.
Private String getPageTitle(String str) {
String lTitle = "";
String lPattern = "(?:<\\s*title\\s*>(?<t>[^<]+))";
Regex rx = New Regex(lPattern, RegexOptions.IgnoreCase | RegexOptions.Compiled );
Match mt = rx.Match(str);
If ( mt.Success )
try{
lTitle = mt.Groups["t"].Value.ToString();
} catch {
lTitle = "";
}
Else
lTitle = "";
return lTitle;
}
}
}

Have You Considered Atomz

August 29, 2003 by Neil Boreland
Adding the free Atomz search to your site will allow you to index all your HTML, text and ASP documents - it's pretty easy to use and quite effective for people with limited experience.