Steven
Hansen
5776-22222
Table of Contents
Data
Structures................................................................................................................... 3
Data structure for tracking unprocessed pages......................................................................................................... 3
Data structure for tracking already processed pages............................................................................................ 3
Data structure for managing stop words....................................................................................................................... 3
Data structure for storing word/page index................................................................................................................ 3
Data structure for implementing robot filter.............................................................................................................. 3
Class Responsibilities.......................................................................................................... 4
Drive the overall crawling process.................................................................................................................................... 4
WebCrawler.h.............................................................................................................................................................................. 4
Store the URL and description for a page........................................................................................................................ 6
Page.h............................................................................................................................................................................................... 6
URL.h................................................................................................................................................................................................ 8
Store index that maps words to pages........................................................................................................................... 11
WordIndex.h............................................................................................................................................................................. 11
Word.h.......................................................................................................................................................................................... 12
Keep track of yet-to-be indexed pages........................................................................................................................... 15
PageQueue.h............................................................................................................................................................................. 15
Keep track of already indexed pages............................................................................................................................... 19
PageSet.h..................................................................................................................................................................................... 19
PageStatus.h............................................................................................................................................................................. 21
Load and store stop words................................................................................................................................................... 22
StopWords.h.............................................................................................................................................................................. 22
Robot Filter..................................................................................................................................................................................... 24
RobotFilter.h............................................................................................................................................................................ 24
Distinguish between HTML and non-HTML links................................................................................................. 26
HTMLFileFilter.h.................................................................................................................................................................... 26
Distinguish between in-scope and out-of-scope links........................................................................................ 28
URLScopeFilter.h................................................................................................................................................................... 28
Resolve relative URLs.............................................................................................................................................................. 30
URLResolver.h.......................................................................................................................................................................... 30
Parses words, links, and description from HTML pages................................................................................... 32
XMLParser.h.............................................................................................................................................................................. 32
Populate word index................................................................................................................................................................. 35
WordParser.h............................................................................................................................................................................ 35
Generate XML output file........................................................................................................................................................ 37
OuputGenerator.h.................................................................................................................................................................. 37
Algorithms......................................................................................................................... 39
Main driver for the crawling process............................................................................................................................ 39
HTML parser.................................................................................................................................................................................. 39
XML output page generation................................................................................................................................................ 39
Detailed description of the following data structures, including justification for each choice
In my PageQueue I will implement my LinkedList. All this data structure needs to do is
get the front Page off of the list and return it. It also needs to append Pages to the other end of the list
as well. No searching over this
data structure should be needed.
The PageHistory will definitely be a BST. The program will have to continuously add pages to the history (no remove method will be required), and then it will need to have a method to query the tree to see if a Page has already been hit. This will occur very often in the program so a data structure for fast searching will be a requirement.
For the stop words I will probably just use an array with a search method that takes advantage of the fact that the stop words should be given to us in ascending order. The BST as I have already implemented it would not be adequate, as it would just turn into a linkedlist since the words would be inserted in order.
The WordIndex, which will include a map to the pages, will need to be a BST. The WordIndex will need add, iterate, and find methods that will all need to work optimally to search over the tree. The word class will include a PageSet object that will be a BST of Pages the word appeared on. That PageSet list will also need to have a find method and a iterator. As a set it will obviously need to be unique entries.
As the robots.txt files are usually decently short, I will keep my RobotFilter class with the same data structure with which I originally built it. Just a short array of URL objects that the IsUrlPathExluded method can just iterate over as an array should be fast enough for my needs.
/*
*
WebCrawler.h
*
webcrawler-proj
*
*
Created by Steven Hansen on 2/11/09.
*
Copyright 2009 Spatical Company. All rights reserved.
*
*/
#ifndef WEBCRAWLER_H
#define
WEBCRAWLER_H
#include <string>
#include <iostream>
using namespace std;
#include "WordIndex.h"
#include "PageHistory.h"
#include "PageQueue.h"
#include "OutputGenerator.h"
/*
* Drive the overall crawling process
*
* This will be the main class in the
webcrawler program.
* It will contain the main loop that
will keep the
* program running over the entire scope
of the crawl.
*
* A web crawler is an automated program
that traverses the world wide web by
* retrieving linked pages while
collecting certain information. For this
* project you will implement a simple
web crawler that will traverse a site
* specified by a given URL (Uniform
Resource Locator), collecting all of the
* textual words, the page they were
found on, and a short description of that
* page. It will output the collected
data into a specified format which may
* then be used to generate an index of
the site.
*
*/
class WebCrawler {
private:
// Tree of all indexed words
WordIndex * words;
// Tree of all indexed pages
PageHistory * pages;
// List of pages yet to be indexed
PageQueue * queue;
public:
/*
* No-argument constructor setup any
default values
* the private objects need to be
initialized ( call Init() )
*/
WebCrawler();
/*
* Destructure that will initiate the
destruction
* of basically the entire program
*/
~WebCrawler();
/*
* Load Stop words takes the filename and
loads the stop words
* this will pretty much just pass the
filename to the
* WordIndex object so it can be added to
the stop words list
*
* Parameters:
* filename - the filename of the stopwords file
to load
*
*/
void
loadStopWords(string & filename);
/*
* Main crawl loop function
* Select a page that has not yet been
indexed (the first will be the start url)
* Download the selected page
* For all text areas in the page, parse
out all of the words.
*
* For all <A> tags with href
attributes and all <FRAME> and <IFRAME>
* tags with src attributes, add the URLs
in the href or src attributes
* to the set of pages that still need to
be indexed
* (but only if they're HTML files with
the same prefix as the start URL)
*
* Save summary information for the page
*
* Repeat until there are no pages left
to index
*
* Parameters:
* start -
The first url, the start location for the crawl
*
*/
void
crawl(string & start);
/*
* This method saves the program output
to the give filename/location
* This method will not produce anything
interesting if called before
* crawl(). This method will use the
OutputGenerator class
*
* Parameters:
* filename - the filename/location of the
generated output file
*
*/
void
outputfile(string & filename);
private:
/*
* Initialization function for the
internal private objects
*/
void Init();
};
#endif
/*
*
Page.h
*
webcrawler-proj
*
*
Created by Steven Hansen on 2/11/09.
*
Copyright 2009 Spatical Company. All rights reserved.
*
*/
#ifndef PAGE_H
#define PAGE_H
#include <string>
#include "URL.h"
#include "PageStatus.h"
#include "XMLParser.h"
/*
* Page represents a webpage
* Store the URL and description for a
page
*
* Also needs to handle exceptions when a
page unable to download
*
*/
class Page {
private:
// url object that holds and operates with the urls
URL * url;
// just a string desription of the page
string description;
// variable to keep track of the status of the page
PageStatus status;
public:
/*
* No-argument constructor setup any
default values
* allocate space for a URL
*/
Page();
/*
* Constructor to save values for the
Page immediately
*
* Parameters:
* link -
URL object for this page
* desc -
description as found on the webpage
*/
Page(URL & link,
string & desc) : url(link), description(desc);
/*
* Parse the items on the page
*
* Calls the XMLParser for this page
*
*/
void
ParsePage();
/*
* Page Destructure
* delete the URL
*/
~Page();
public: // Getters
and Setters
/*
* get the URL
*
* Returns:
* url - URL object
*
*/
URL GetUrl()
{
return url;
}
/*
* set the URL from URL object
*
* Parameters
* link -
URL object
*
*/
void SetUrl(URL
& link)
{
url = new URL(link);
}
/*
* set the URL from a url string
*
* Parameters
* link -
string
*
*/
void
SetUrl(string & link)
{
url = new URL(link);
}
/*
* get the description
*
* Returns:
* description - string description
*
*/
string GetDescription()
{
return
description;
}
/*
* set the description from a string
description
*
* Parameters
* description - string
*
*/
void
SetDescription(string & desc)
{
description
= desc;
}
/*
* get the status (of type PageStatus)
*
* Returns:
* status - string description
*
*/
PageStatus GetStatus()
{
return status;
}
/*
* set the status from a PageStatus
arguement
*
* Parameters
* status -
string
*
*/
void
SetStatus(PageStatus current_status)
{
status =
current_status;
}
};
#endif
/*
*
URL.h
*
webcrawler-proj
*
*
Created by Steven Hansen on 2/11/09.
*
Copyright 2009 Spatical Company. All rights reserved.
*
*/
#ifndef URL_H
#define URL_H
#include <string>
/*
* URLs have the following format:
*
<scheme>://<net_loc>/<path>;<params>?<query>#<fragment>
*
* This class stores the url and all of
its pieces
*
*/
class URL {
private:
// string object to store original full url
string url;
// The scheme/protocal of the page (i.e. http)
string scheme;
// Required portion, stores the domain of the URL
string net_loc;
// Optional path portion of the URL
string path;
// Optional paramaters part of the URL ;<params>
string params;
// Optional query portion of the URL
string query;
// Optional fragment portion of the URL
string fragment;
public:
/*
* No-argument constructor setup any
default values
* allocate space for a URL
*/
URL();
/*
* Constructor given the specified string
URL
* Call the method to parse and store the
URL
*
* Parameters:
* url -
string url to parse and save
*
*/
URL(string & url);
/*
* Copy constructor for the URL
* allocate space for a URL
*
* Parameters:
* url -
url object to copy
*
*/
URL(URL & url);
/*
* Simple destructure
* may not need to do anything special
since the string
* objects can all desctruct themselves
*/
~URL();
/*
* compare method for the URL, compares
other url with
* the url of the object it is called on
*
* Parameters:
* other -
Other URL object to compare with
*
* Returns:
* -4 if this url is not in the same
domain (net_loc)
* -3 if this url is in the same domain
(net_loc) but not the same path
* -2 same domain, same path, same params
* -1 same domain, same path, same
params, same query
* 0 same everything!
* 1,2,3,4 same as the negative value but
when the: other url > this url
*/
int compare(URL
& other);
public: // Getters
and Setters
// Removed to save paper, these are all just simple one
line functions
private:
/*
* method to parse and store the pieces
of a string url
*
* Parameters:
* url -
string url to parse
*
*/
void
parseURL(string & url);
};
#endif
/*
*
WordIndex.h
*
webcrawler-proj
*
*
Created by Steven Hansen on 2/11/09.
*
Copyright 2009 Spatical Company. All rights reserved.
*
*/
#ifndef
WORDINDEX_H
#define
WORDINDEX_H
#include "Word.h"
/*
* Store index that maps words to pages
*
* This class is basically a BST of the
object type Word
* the words in the list all map to a
list of pages
*
*/
class WordIndex {
private:
// The root node in the WordIndex tree
Word * root;
// The count of words in the tree
int count;
// Internal value so save the location of the iteration
Word * currentWord;
public:
/*
* no-arg constructor for a word index
*/
WordIndex();
/*
* Destroy this wordset, Destroy the
words and set
* of pages, but the pages do not need to
be destroyed
* since the pages may still be pointed
to by other words
*
*/
~WordIndex();
/*
* method to add a word to the index of
indexed word
* only adds a new word to the index if
the page doesn't
* already exist in the index.
*
* If the word already exists, add the page
to the words
* set of mapped pages
*
* Parameters:
* word -
the word that was found
* page -
the page the word was found on
*
* Returns:
* pointer to the newly inserted node, or
NULL if the
* node already existed in the tree
*
*/
Word * Insert(string & word,
Page & page);
/*
* Get the next word in the list, when
start is set to true
* it will start the iteration over back
at the first node
* otherwise it will continue to return
the next word each
* time this method is called
*
*
* Parameters:
* start -
boolean, when set to true the iteration starts over
*
* Returns:
* word -
the next word object
*
*/
Word GetNext(bool start);
/*
* Assigment operator. Makes a complete
copy of its argument
*
* Parameters:
* other -
wordindex to copy
*
* Returns:
* Reference to oneself
*/
WordIndex& operator =(const WordIndex
& other)
{
Copy(other);
return *this;
}
public: // Getters
and Setters
// Removed to save paper, these are all just simple one
line functions
private:
/*
* Makes this object a copy of the other
BST
*
* Parameters:
* other -
WordIndex to copy
*
*/
void Copy(const WordIndex
& other);
};
#endif
/*
*
WordIndex.h
*
webcrawler-proj
*
*
Created by Steven Hansen on 2/11/09.
*
Copyright 2009 Spatical Company. All rights reserved.
*
*/
#ifndef WORD_H
#define WORD_H
#include <string>
#include "PageSet.h"
/*
* Store index that maps words to pages
*
* This class is basically a BST of the
object type Word
* the words in the list all map to a
list of pages
*
*/
class Word {
private:
// the actual word string
string word;
// Not required: total count of times the word was
indexed
int count;
// Set of pages pointers this word points to
PageSet * pages;
public:
/*
* no-arg constructor for a word
*/
Word();
/*
* Word Constructor, creates a word for
the index
*
* Parameters:
* word -
the word (string) to index
* page -
Page object to add to the PageSet
*
*/
Word(string & word,
Page & page);
/*
* Destroy this word, Destroy the word
and set
* of pages, but the pages do not need to
be destroyed
* since the pages may still be pointed
to by other words
*
*/
~Word();
/*
* method to add a word to the index of
indexed word
* increments the word count
*
* Parameters:
* word -
the word that was found
* page -
the page the word was found on
*
* Returns:
* true if the page was added to the word
* false if the page was already indexed
*/
bool
InsertPage(Page & page);
/*
* Compare method for the word, will be
called
* by == operator, compares the words as
strings
*
* Parameters:
* other -
word object to compare with
*
* Returns:
* the same as the string compare method
*
*/
int
Compare(Word & other);
public: // Getters
and Setters
// Removed to save paper, these are all just simple one
line functions
};
#endif
/*
*
PageQueue.h
*
webcrawler-proj
*
*
Created by Steven Hansen on 2/11/09.
*
Copyright 2009 Spatical Company. All rights reserved.
*
*/
#ifndef
PAGEQUEUE_H
#define
PAGEQUEUE_H
#include <string>
/*
* Doubly linked list of Page nodes, will
be used to
* Keep track of yet-to-be indexed pages
*/
class PageQueue
{
private:
// First page in the queue or the next one to be crawled
Page *
first;
// Last page in the queue or the one after which to
insert
Page *
last;
// Running count of the number of pages in the queue
int count;
public:
/*
*
No-arg constructor.
Initializes an empty linked list
*/
PageQueue()
:
first(NULL), last(NULL), count(0)
{
}
/*
*
Copy constructor. Makes a
complete copy of its argument
*/
PageQueue(const PageQueue
& other) :
first(NULL), last(NULL),
count(other.count)
{
Copy(other);
}
/*
*
Destructor
*/
~PageQueue()
{
Clear();
delete first;
delete last;
}
/*
* Assignment operator. Makes a complete copy of its argument
*
* Parameters:
* other -
PageQueue to copy
*
* Returns:
* reference to oneself
*/
PageQueue&
operator =(const PageQueue & other)
{
Copy(other);
return *this;
}
/*
* Check if the list is empty or not
*
* Returns:
* true if the list is empty
* false if the list is not empty
*/
bool IsEmpty() const
{
if( count == 0 ) return true;
else return false;
}
/*
*
Removes all values from the list
*/
void Clear();
/*
* Get the size of the queue
*
* Returns:
* the number of values in the list
*/
int GetCount() const
{
return count;
}
/*
* Gets first node in the list
*
* Returns:
* pointer to the first node in the list,
or NULL if the list is empty
*/
Page *
GetFirst() const
{
if( !IsEmpty()
)
{
return first;
}
else
{
return NULL;
}
}
/*
* Gets last node in the list
*
* Returns:
* pointer to the last node in the list,
or NULL if the list is empty
*/
Page *
GetLast() const
{
if( !IsEmpty()
)
{
return last;
}
else
{
return NULL;
}
}
/*
* Inserts value v into the list after
node n
*
* Parameters:
* v The new value being inserted
* n A node that is already in the list
after which the new node should
* be inserted.
* If n is NULL, the new node should be
inserted at the beginning of the list.
*
* Returns:
* pointer to the newly inserted node
*/
Page *
Insert(const std::string & v, LLNode * n);
/*
* Searches for the first occurrence of
value v that appears in the list
* after node n
*
* Parameters:
* v The value being searched for
* n The node in the list after which the
search should begin.
* If n is NULL, the list should be
searched from the beginning.
*
* Returns:
* pointer to the node containing v, or
NULL if v is not found
*/
Page *
Find(const std::string & v, LLNode * n) const;
/*
* Removes node n
from the list
*
* Parameters:
* n The node being removed from the list
*/
void Remove(Page
* n);
private:
/*
* Internal copy method to be called by
various other functions
* such as the copy constructor
*
* Parameters:
* other -
PageQueue to copy
*
* Returns:
* reference to oneself
*/
void Copy(const PageQueue
& other);
};
#endif
/*
*
PageSet.h
*
webcrawler-proj
*
*
Created by Steven Hansen on 2/11/09.
*
Copyright 2009 Spatical Company. All rights reserved.
*
*/
#ifndef PAGESET_H
#define PAGESET_H
#include "Page.h"
/*
* Stores a set of pages, will be
implemented by the webcrawler
* to Keep track of indexed pages
(PageHistory object)
*
* This class is basically a BST of the
object type Page
* The page is going to be an external
object and only
* when the full PageHistory object is
deleted
* should all of the actual pages also be
cleaned up
* otherwise the destruction of a PageSet
will
* not include the destruction of the
Pages in the set
*
*/
class WordIndex {
private:
// The root node in the Page tree
Page * root;
// The count of pages in the tree
int count;
public:
/*
* no-arg constructor for a page set
*/
PageSet();
/*
* Destroy this pageset, Destroy the set
* of pages, but the pages do not need to
be destroyed
* since the pages may still be pointed
to by other sets
*
*/
~PageSet();
/*
* method to add a page to the set
* only adds a new page to the index if
the page doesn't
* already exist in the set.
*
* Parameters:
* page -
the page to insert
*
* Returns:
* pointer to the newly inserted node, or
NULL if the
* node already existed in the tree
*
*/
Page * Insert(Page
& page);
/*
* Copy Constructor. Makes a complete
copy of its argument
*
* Parameters:
* other -
PageSet to copy
*
*/
PageSet(const PageSet
& other) : root(NULL), count(other.count)
{
Copy(other);
}
/*
* Assigment operator. Makes a complete
copy of its argument
*
* Parameters:
* other -
PageSet to copy
*
* Returns:
* Reference to oneself
*/
PageSet& operator =(const PageSet
& other)
{
Copy(other);
return *this;
}
/*
* Function to check if the set is empty
or not
*
* Returns:
* true if the BST is empty
* false if the BST is not empty
*
*/
bool IsEmpty() const
{
if( count
<= 0 ) return true;
else return false;
}
/* Searches the tree for the page p
*
* Parameters:
* page -
The page being searched for
*
* Returns:
* pointer to the node containing p, or
NULL if p is not in the tree
*
*/
Page * Find(const Page &
page) const;
/*
* Get the next page in the set, when
start is set to true
* it will start the iteration over back
at the first node
* otherwise it will continue to return
the next page each
* time this method is called
*
* Parameters:
* start -
boolean, when set to true the iteration starts over
*
* Returns:
* page -
the next Page object
*
*/
Page GetNext(bool start);
public: // Getters
and Setters
// Removed to save paper, these are all just simple one
line functions
private:
/*
* Makes this object a copy of the other
BST
*
* Parameters:
* other -
PageSet to copy
*
*/
void Copy(const PageSet
& other);
};
#endif
/*
*
PageStatus.h
*
webcrawler-proj
*
*
Created by Steven Hansen on 2/11/09.
*
Copyright 2009 Spatical Company. All rights reserved.
*
*/
#ifndef
PAGESTATUS_H
#define PAGESTATUS_H
/*
* simple enumerated list to help keep
track of the current state
* of a page, regardless of the current
lists it is a part of
*/
enum PageStatus { DOWNLOADED, PARSED, PENDING, PROCESSED,
ARCHIVED };
#endif
/*
*
StopWords.h
*
webcrawler-proj
*
*
Created by Steven Hansen on 2/19/09.
*
Copyright 2009 Spatical Company. All rights reserved.
*
*/
#ifndef
STOPWORDS_H
#define
STOPWORDS_H
#include <string>
/*
* Load and store stop words
*
* This class is just a wrapper around an
array of strings
* with some methods optimized for the
fact that the stop
* words will be inserted in alphebetical
ascending order
*
*/
class StopWords {
private:
// The array of stop words
Page * stops[];
// The count of stop words
int count;
public:
/*
* no-arg constructor
*/
StopWords();
/*
* Destroy this set of stopwords, Destroy
the array of words
* and make sure all of the strings are
also deleted
*
*/
~StopWords();
/*
* method to add a word to the array
*
* Parameters:
* word -
the word to insert
*
* Returns:
* index to the newly inserted node
*
*/
int * Insert(string &
word);
/*
* Function to check if the array is
empty or not
*
* Returns:
* true if the array is empty
* false if the array is not empty
*
*/
bool IsEmpty() const
{
if( count
<= 0 ) return true;
else return false;
}
/*
* Searches the array for the word, this
algorithm
* can do this in a binary search method
knowing the
* array is inserted in alphebetical
ascending order
*
* Parameters:
* word -
The word (string) being searched for
*
* Returns:
* true if the word is NOT in the list
* false if the word IS in the list
*
*/
bool * Find(const string & word)
const;
public: // Getters
and Setters
// Removed to save paper, these are all just simple one
line functions
private:
};
#endif
/*
*
RobotFilter.h
*
webcrawler-proj
*
*
Created by Steven Hansen on 2/11/09.
* Copyright 2009 Spatical Company. All rights reserved.
*
*/
#ifndef
ROBOTFILTER_H
#define
ROBOTFILTER_H
#include <iostream>
#include <fstream>
using namespace std;
#include <cstring>
#define SIZE 200
/*
* simple robot filter program
* this is my second iteration I made
while studying
* for the programming exam
*/
class RobotFilter
{
private:
// array of char strings
char *
URLs[SIZE];
// number of urls in this filter
int URLcount;
public:
/*
* RobotFilter Constructor
*
*/
RobotFilter() :
URLcount(0)
{
return;
}
/*
* LoadExclusionFile -- Load URLs from
file
* Required Parameters:
* input
stream
*
*/
bool
LoadExclusionFile(ifstream &file);
/*
* IsUrlPathExcluded -- Checks if given
urlPath is excluded by robots.txt fle
*
* Parameters:
* urlPath - the path to check
*
* Returns:
*
*
*/
bool
IsUrlPathExluded(const char * urlPath) const;
};
#endif
/*