Steven Hansen

5776-22222


 

Table of Contents

Data Structures................................................................................................................... 3

Data structure for tracking unprocessed pages......................................................................................................... 3

Data structure for tracking already processed pages............................................................................................ 3

Data structure for managing stop words....................................................................................................................... 3

Data structure for storing word/page index................................................................................................................ 3

Data structure for implementing robot filter.............................................................................................................. 3

Class Responsibilities.......................................................................................................... 4

Drive the overall crawling process.................................................................................................................................... 4

WebCrawler.h.............................................................................................................................................................................. 4

Store the URL and description for a page........................................................................................................................ 6

Page.h............................................................................................................................................................................................... 6

URL.h................................................................................................................................................................................................ 8

Store index that maps words to pages........................................................................................................................... 11

WordIndex.h............................................................................................................................................................................. 11

Word.h.......................................................................................................................................................................................... 12

Keep track of yet-to-be indexed pages........................................................................................................................... 15

PageQueue.h............................................................................................................................................................................. 15

Keep track of already indexed pages............................................................................................................................... 19

PageSet.h..................................................................................................................................................................................... 19

PageStatus.h............................................................................................................................................................................. 21

Load and store stop words................................................................................................................................................... 22

StopWords.h.............................................................................................................................................................................. 22

Robot Filter..................................................................................................................................................................................... 24

RobotFilter.h............................................................................................................................................................................ 24

Distinguish between HTML and non-HTML links................................................................................................. 26

HTMLFileFilter.h.................................................................................................................................................................... 26

Distinguish between in-scope and out-of-scope links........................................................................................ 28

URLScopeFilter.h................................................................................................................................................................... 28

Resolve relative URLs.............................................................................................................................................................. 30

URLResolver.h.......................................................................................................................................................................... 30

Parses words, links, and description from HTML pages................................................................................... 32

XMLParser.h.............................................................................................................................................................................. 32

Populate word index................................................................................................................................................................. 35

WordParser.h............................................................................................................................................................................ 35

Generate XML output file........................................................................................................................................................ 37

OuputGenerator.h.................................................................................................................................................................. 37

Algorithms......................................................................................................................... 39

Main driver for the crawling process............................................................................................................................ 39

HTML parser.................................................................................................................................................................................. 39

XML output page generation................................................................................................................................................ 39

 


Data Structures

Detailed description of the following data structures, including justification for each choice

 

Data structure for tracking unprocessed pages

In my PageQueue I will implement my LinkedList.  All this data structure needs to do is get the front Page off of the list and return it.  It also needs to append Pages to the other end of the list as well.  No searching over this data structure should be needed.

Data structure for tracking already processed pages

The PageHistory will definitely be a BST.  The program will have to continuously add pages to the history (no remove method will be required), and then it will need to have a method to query the tree to see if a Page has already been hit.  This will occur very often in the program so a data structure for fast searching will be a requirement.

 

Data structure for managing stop words

For the stop words I will probably just use an array with a search method that takes advantage of the fact that the stop words should be given to us in ascending order.  The BST as I have already implemented it would not be adequate, as it would just turn into a linkedlist since the words would be inserted in order.

 

Data structure for storing word/page index

The WordIndex, which will include a map to the pages, will need to be a BST.  The WordIndex will need add, iterate, and find methods that will all need to work optimally to search over the tree.  The word class will include a PageSet object that will be a BST of Pages the word appeared on.  That PageSet list will also need to have a find method and a iterator.   As a set it will obviously need to be unique entries.

 

Data structure for implementing robot filter

As the robots.txt files are usually decently short, I will keep my RobotFilter class with the same data structure with which I originally built it. Just a short array of URL objects that the IsUrlPathExluded method can just iterate over as an array should be fast enough for my needs.

 


Class Responsibilities

Drive the overall crawling process

WebCrawler.h

/*

 *  WebCrawler.h

 *  webcrawler-proj

 *

 *  Created by Steven Hansen on 2/11/09.

 *  Copyright 2009 Spatical Company. All rights reserved.

 *

 */

 

#ifndef WEBCRAWLER_H

#define WEBCRAWLER_H

 

#include <string>

#include <iostream>

using namespace std;

 

#include "WordIndex.h"

#include "PageHistory.h"

#include "PageQueue.h"

#include "OutputGenerator.h"

 

/*

 * Drive the overall crawling process

 *

 * This will be the main class in the webcrawler program.

 * It will contain the main loop that will keep the

 * program running over the entire scope of the crawl.

 *

 * A web crawler is an automated program that traverses the world wide web by

 * retrieving linked pages while collecting certain information. For this

 * project you will implement a simple web crawler that will traverse a site

 * specified by a given URL (Uniform Resource Locator), collecting all of the

 * textual words, the page they were found on, and a short description of that

 * page. It will output the collected data into a specified format which may

 * then be used to generate an index of the site.

 *

 */

class WebCrawler {

private:

     // Tree of all indexed words

     WordIndex * words;

    

     // Tree of all indexed pages

     PageHistory * pages;

    

     // List of pages yet to be indexed

     PageQueue * queue;

    

public:

     /*

      * No-argument constructor setup any default values

      * the private objects need to be initialized ( call Init() )

      */

     WebCrawler();

    

     /*

      * Destructure that will initiate the destruction

      * of basically the entire program

      */

     ~WebCrawler();

    

     /*

      * Load Stop words takes the filename and loads the stop words

      * this will pretty much just pass the filename to the

      * WordIndex object so it can be added to the stop words list

      *

      * Parameters:

      * filename      - the filename of the stopwords file to load

      *

      */

     void loadStopWords(string & filename);

    

     /*

      * Main crawl loop function

      * Select a page that has not yet been indexed (the first will be the start url)

      * Download the selected page

      * For all text areas in the page, parse out all of the words.

      *

      * For all <A> tags with href attributes and all <FRAME> and <IFRAME>

      * tags with src attributes, add the URLs in the href or src attributes

      * to the set of pages that still need to be indexed

      * (but only if they're HTML files with the same prefix as the start URL)

      *

      * Save summary information for the page

      *

      * Repeat until there are no pages left to index

      *

      * Parameters:

      * start  - The first url, the start location for the crawl

      *

      */

     void crawl(string & start);

    

     /*

      * This method saves the program output to the give filename/location

      * This method will not produce anything interesting if called before

      * crawl(). This method will use the OutputGenerator class

      *

      * Parameters:

      * filename      - the filename/location of the generated output file

      *

      */

     void outputfile(string & filename);

    

private:

     /*

      * Initialization function for the internal private objects

      */

     void Init();

};

#endif


Store the URL and description for a page

Page.h

/*

 *  Page.h

 *  webcrawler-proj

 *

 *  Created by Steven Hansen on 2/11/09.

 *  Copyright 2009 Spatical Company. All rights reserved.

 *

 */

 

#ifndef PAGE_H

#define PAGE_H

 

#include <string>

 

#include "URL.h"

#include "PageStatus.h"

#include "XMLParser.h"

 

/*

 * Page represents a webpage

 * Store the URL and description for a page

 *

 * Also needs to handle exceptions when a page unable to download

 *

 */

class Page {

private:

     // url object that holds and operates with the urls

     URL * url;

    

     // just a string desription of the page

     string description;

    

     // variable to keep track of the status of the page

     PageStatus status;

    

public:

     /*

      * No-argument constructor setup any default values

      * allocate space for a URL

      */

     Page();

    

    

     /*

      * Constructor to save values for the Page immediately

      *

      * Parameters:

      * link   - URL object for this page

      * desc   - description as found on the webpage

      */

     Page(URL & link, string & desc) : url(link), description(desc);

    

     /*

      * Parse the items on the page

      *

      * Calls the XMLParser for this page

      *

      */

     void ParsePage();

    

     /*

      * Page Destructure

      * delete the URL

      */

     ~Page();

    

public: // Getters and Setters

     /*

      * get the URL

      *

      * Returns:

      * url - URL object

      *

      */

     URL GetUrl()

     {

       return url;

     }

    

     /*

      * set the URL from URL object

      *

      * Parameters

      * link   - URL object

      *

      */

     void SetUrl(URL & link)

     {

       url = new URL(link);

     }

    

     /*

      * set the URL from a url string

      *

      * Parameters

      * link   - string

      *

      */

     void SetUrl(string & link)

     {

       url = new URL(link);

     }

    

     /*

      * get the description

      *

      * Returns:

      * description - string description

      *

      */

     string GetDescription()

     {

       return description;

     }

    

     /*

      * set the description from a string description

      *

      * Parameters

      * description   - string

      *

      */

     void SetDescription(string & desc)

     {

       description = desc;

     }

    

     /*

      * get the status (of type PageStatus)

      *

      * Returns:

      * status - string description

      *

      */

     PageStatus GetStatus()

     {

       return status;

     }

    

     /*

      * set the status from a PageStatus arguement

      *

      * Parameters

      * status - string

      *

      */

     void SetStatus(PageStatus current_status)

     {

       status = current_status;

     }

};

 

#endif

URL.h

/*

 *  URL.h

 *  webcrawler-proj

 *

 *  Created by Steven Hansen on 2/11/09.

 *  Copyright 2009 Spatical Company. All rights reserved.

 *

 */

 

#ifndef URL_H

#define URL_H

 

#include <string>

 

 

/*

 * URLs have the following format:

 * <scheme>://<net_loc>/<path>;<params>?<query>#<fragment>

 *

 * This class stores the url and all of its pieces

 *

 */

class URL {

private:

     // string object to store original full url

     string url;

    

     // The scheme/protocal of the page (i.e. http)

     string scheme;

    

     // Required portion, stores the domain of the URL

     string net_loc;

    

     // Optional path portion of the URL

     string path;

    

     // Optional paramaters part of the URL ;<params>

     string params;

    

     // Optional query portion of the URL

     string query;

    

     // Optional fragment portion of the URL

     string fragment;

public:

     /*

      * No-argument constructor setup any default values

      * allocate space for a URL

      */

     URL();

    

     /*

      * Constructor given the specified string URL

      * Call the method to parse and store the URL

      *

      * Parameters:

      * url    - string url to parse and save

      *

      */

     URL(string & url);

    

     /*

      * Copy constructor for the URL

      * allocate space for a URL

      *

      * Parameters:

      * url    - url object to copy

      *

      */

     URL(URL & url);

    

     /*

      * Simple destructure

      * may not need to do anything special since the string

      * objects can all desctruct themselves

      */

     ~URL();

    

     /*

      * compare method for the URL, compares other url with

      * the url of the object it is called on

      *

      * Parameters:

      * other  - Other URL object to compare with

      *

      * Returns:

      * -4 if this url is not in the same domain (net_loc)

      * -3 if this url is in the same domain (net_loc) but not the same path

      * -2 same domain, same path, same params

      * -1 same domain, same path, same params, same query

      * 0 same everything!

      * 1,2,3,4 same as the negative value but when the: other url > this url

      */

     int compare(URL & other);

    

public: // Getters and Setters

     // Removed to save paper, these are all just simple one line functions

 

private:

     /*

      * method to parse and store the pieces of a string url

      *

      * Parameters:

      * url    - string url to parse

      *

      */

     void parseURL(string & url);

};

 

#endif

 


Store index that maps words to pages

WordIndex.h

/*

 *  WordIndex.h

 *  webcrawler-proj

 *

 *  Created by Steven Hansen on 2/11/09.

 *  Copyright 2009 Spatical Company. All rights reserved.

 *

 */

 

#ifndef WORDINDEX_H

#define WORDINDEX_H

 

#include "Word.h"

/*

 * Store index that maps words to pages

 *

 * This class is basically a BST of the object type Word

 * the words in the list all map to a list of pages

 *

 */

class WordIndex {

private:

     // The root node in the WordIndex tree

     Word * root;

    

     // The count of words in the tree

     int count;

    

     // Internal value so save the location of the iteration

     Word * currentWord;

    

public:

     /*

      * no-arg constructor for a word index

      */

     WordIndex();

    

     /*

      * Destroy this wordset, Destroy the words and set

      * of pages, but the pages do not need to be destroyed

      * since the pages may still be pointed to by other words

      *

      */

     ~WordIndex();

    

     /*

      * method to add a word to the index of indexed word

      * only adds a new word to the index if the page doesn't

      * already exist in the index.

      *

      * If the word already exists, add the page to the words

      * set of mapped pages

      *

      * Parameters:

      * word   - the word that was found

      * page   - the page the word was found on

      *

      * Returns:

      * pointer to the newly inserted node, or NULL if the

      * node already existed in the tree

      *

      */

     Word * Insert(string & word, Page & page);

    

     /*

      * Get the next word in the list, when start is set to true

      * it will start the iteration over back at the first node

      * otherwise it will continue to return the next word each

      * time this method is called

      *

      *

      * Parameters:

      * start  - boolean, when set to true the iteration starts over

      *

      * Returns:

      * word          - the next word object

      *

      */

     Word GetNext(bool start);

    

     /*

      * Assigment operator. Makes a complete copy of its argument

      *

      * Parameters:

      * other  - wordindex to copy

      *

      * Returns:

      * Reference to oneself

      */

     WordIndex& operator =(const WordIndex & other)

     {

       Copy(other);

       return *this;

     }

    

public: // Getters and Setters

     // Removed to save paper, these are all just simple one line functions

    

private:

     /*

      * Makes this object a copy of the other BST

      *

      * Parameters:

      * other  - WordIndex to copy

      *

      */

     void Copy(const WordIndex & other);

};

 

#endif

 

Word.h

/*

 *  WordIndex.h

 *  webcrawler-proj

 *

 *  Created by Steven Hansen on 2/11/09.

 *  Copyright 2009 Spatical Company. All rights reserved.

 *

 */

 

#ifndef WORD_H

#define WORD_H

 

#include <string>

 

#include "PageSet.h"

 

/*

 * Store index that maps words to pages

 *

 * This class is basically a BST of the object type Word

 * the words in the list all map to a list of pages

 *

 */

class Word {

private:

     // the actual word string

     string word;

    

     // Not required: total count of times the word was indexed

     int count;

    

     // Set of pages pointers this word points to

     PageSet * pages;

    

public:

     /*

      * no-arg constructor for a word

      */

     Word();

    

     /*

      * Word Constructor, creates a word for the index

      *

      * Parameters:

      * word   - the word (string) to index

      * page   - Page object to add to the PageSet

      *

      */

     Word(string & word, Page & page);

    

     /*

      * Destroy this word, Destroy the word and set

      * of pages, but the pages do not need to be destroyed

      * since the pages may still be pointed to by other words

      *

      */

     ~Word();

    

     /*

      * method to add a word to the index of indexed word

      * increments the word count

      *

      * Parameters:

      * word   - the word that was found

      * page   - the page the word was found on

      *

      * Returns:

      * true if the page was added to the word

      * false if the page was already indexed

      */

     bool InsertPage(Page & page);

    

     /*

      * Compare method for the word, will be called

      * by == operator, compares the words as strings

      *

      * Parameters:

      * other  - word object to compare with

      *

      * Returns:

      * the same as the string compare method

      *

      */

     int Compare(Word & other);

    

public: // Getters and Setters

     // Removed to save paper, these are all just simple one line functions

};

 

#endif

 


Keep track of yet-to-be indexed pages

PageQueue.h

/*

 *  PageQueue.h

 *  webcrawler-proj

 *

 *  Created by Steven Hansen on 2/11/09.

 *  Copyright 2009 Spatical Company. All rights reserved.

 *

 */

 

#ifndef PAGEQUEUE_H

#define PAGEQUEUE_H

 

#include <string>

 

 

/*

 * Doubly linked list of Page nodes, will be used to

 * Keep track of yet-to-be indexed pages

 */

class PageQueue

     {

     private:

       // First page in the queue or the next one to be crawled

       Page * first;

      

       // Last page in the queue or the one after which to insert

       Page * last;

      

       // Running count of the number of pages in the queue

       int count;

      

     public:

      

       /*

        *  No-arg constructor.  Initializes an empty linked list

        */

       PageQueue() :

       first(NULL), last(NULL), count(0)

       {

       }

      

      

      

       /*

        *  Copy constructor.  Makes a complete copy of its argument

        */

       PageQueue(const PageQueue & other) :

       first(NULL), last(NULL), count(other.count)

       {

              Copy(other);

       }

      

      

       /*

        *  Destructor

        */

       ~PageQueue()

       {

              Clear();

              delete first;

              delete last;

       }

      

      

       /*

        * Assignment operator.  Makes a complete copy of its argument

        *

        * Parameters:

        * other      - PageQueue to copy

        *

        * Returns:

        * reference to oneself

        */

       PageQueue& operator =(const PageQueue & other)

       {

              Copy(other);

              return *this;

       }

      

       /*

        * Check if the list is empty or not

        *

        * Returns:

        * true if the list is empty

        * false if the list is not empty

        */

       bool IsEmpty() const

       {

              if( count == 0 ) return true;

              else return false;

       }

      

       /*

        *  Removes all values from the list

        */

       void Clear();

      

       /*

        * Get the size of the queue

        *

        * Returns:

        * the number of values in the list

        */

       int GetCount() const

       {

              return count;

       }

      

      

      

       /*

        * Gets first node in the list

        *

        * Returns:

        * pointer to the first node in the list, or NULL if the list is empty

        */

       Page * GetFirst() const

       {

              if( !IsEmpty() )

              {

                      return first;

              }

              else

              {

                      return NULL;

              }

       }

      

      

      

       /*

        * Gets last node in the list

        *

        * Returns:

        * pointer to the last node in the list, or NULL if the list is empty

        */

       Page * GetLast() const

       {

              if( !IsEmpty() )

              {

                      return last;

              }

              else

              {

                      return NULL;

              }

       }

      

      

       /*

        * Inserts value v into the list after node n

        * 

        * Parameters:

        * v The new value being inserted

        * n A node that is already in the list after which the new node should

        *      be inserted.

        *      If n is NULL, the new node should be inserted at the beginning of the list.

        *

        * Returns:

        * pointer to the newly inserted node

        */

       Page * Insert(const std::string & v, LLNode * n);

      

      

       /*

        * Searches for the first occurrence of value v that appears in the list

        *   after node n

        *  

        * Parameters:

        * v The value being searched for

        * n The node in the list after which the search should begin.

        *      If n is NULL, the list should be searched from the beginning.

        *

        * Returns:

        * pointer to the node containing v, or NULL if v is not found

        */

       Page * Find(const std::string & v, LLNode * n) const;

      

       /*

        * Removes node n from the list

        * 

        * Parameters:

        * n The node being removed from the list

        */

       void Remove(Page * n);

      

     private:

       /*

        * Internal copy method to be called by various other functions

        * such as the copy constructor

        *

        * Parameters:

        * other      - PageQueue to copy

        *

        * Returns:

        * reference to oneself

        */

       void Copy(const PageQueue & other);

      

     };

 

 

#endif


Keep track of already indexed pages

PageSet.h

/*

 *  PageSet.h

 *  webcrawler-proj

 *

 *  Created by Steven Hansen on 2/11/09.

 *  Copyright 2009 Spatical Company. All rights reserved.

 *

 */

 

 

#ifndef PAGESET_H

#define PAGESET_H

 

#include "Page.h"

/*

 * Stores a set of pages, will be implemented by the webcrawler

 * to Keep track of indexed pages (PageHistory object)

 *

 * This class is basically a BST of the object type Page

 * The page is going to be an external object and only

 * when the full PageHistory object is deleted

 * should all of the actual pages also be cleaned up

 * otherwise the destruction of a PageSet will

 * not include the destruction of the Pages in the set

 *

 */

class WordIndex {

private:

     // The root node in the Page tree

     Page * root;

    

     // The count of pages in the tree

     int count;

    

public:

     /*

      * no-arg constructor for a page set

      */

     PageSet();

    

     /*

      * Destroy this pageset, Destroy the set

      * of pages, but the pages do not need to be destroyed

      * since the pages may still be pointed to by other sets

      *

      */

     ~PageSet();

    

     /*

      * method to add a page to the set

      * only adds a new page to the index if the page doesn't

      * already exist in the set.

      *

      * Parameters:

      * page   - the page to insert

      *

      * Returns:

      * pointer to the newly inserted node, or NULL if the

      * node already existed in the tree

      *

      */

     Page * Insert(Page & page);

    

    

     /*

      * Copy Constructor. Makes a complete copy of its argument

      *

      * Parameters:

      * other  - PageSet to copy

      *

      */

     PageSet(const PageSet & other) : root(NULL), count(other.count)

     {

       Copy(other);

     }

    

     /*

      * Assigment operator. Makes a complete copy of its argument

      *

      * Parameters:

      * other  - PageSet to copy

      *

      * Returns:

      * Reference to oneself

      */

     PageSet& operator =(const PageSet & other)

     {

       Copy(other);

       return *this;

     }

    

     /*

      * Function to check if the set is empty or not

      *

      * Returns:

      * true if the BST is empty

      * false if the BST is not empty

      *

      */

     bool IsEmpty() const

     {

       if( count <= 0 ) return true;

       else return false;

     }

    

     /* Searches the tree for the page p

      * 

      * Parameters:

      * page          - The page being searched for

      *

      * Returns:

      * pointer to the node containing p, or NULL if p is not in the tree

      *

      */

     Page * Find(const Page & page) const;

    

     /*

      * Get the next page in the set, when start is set to true

      * it will start the iteration over back at the first node

      * otherwise it will continue to return the next page each

      * time this method is called

      *

      * Parameters:

      * start  - boolean, when set to true the iteration starts over

      *

      * Returns:

      * page          - the next Page object

      *

      */

     Page GetNext(bool start);

    

public: // Getters and Setters

     // Removed to save paper, these are all just simple one line functions

    

private:

     /*

      * Makes this object a copy of the other BST

      *

      * Parameters:

      * other  - PageSet to copy

      *

      */

     void Copy(const PageSet & other);

};

 

 

#endif

 

PageStatus.h

/*

 *  PageStatus.h

 *  webcrawler-proj

 *

 *  Created by Steven Hansen on 2/11/09.

 *  Copyright 2009 Spatical Company. All rights reserved.

 *

 */

 

#ifndef PAGESTATUS_H

#define PAGESTATUS_H

 

/*

 * simple enumerated list to help keep track of the current state

 * of a page, regardless of the current lists it is a part of

 */

enum PageStatus { DOWNLOADED, PARSED, PENDING, PROCESSED, ARCHIVED };

 

 

#endif


Load and store stop words

StopWords.h

/*

 *  StopWords.h

 *  webcrawler-proj

 *

 *  Created by Steven Hansen on 2/19/09.

 *  Copyright 2009 Spatical Company. All rights reserved.

 *

 */

 

 

#ifndef STOPWORDS_H

#define STOPWORDS_H

 

#include <string>

/*

 * Load and store stop words

 *

 * This class is just a wrapper around an array of strings

 * with some methods optimized for the fact that the stop

 * words will be inserted in alphebetical ascending order

 *

 */

class StopWords {

private:

     // The array of stop words

     Page * stops[];

    

     // The count of stop words

     int count;

    

public:

     /*

      * no-arg constructor

      */

     StopWords();

    

     /*

      * Destroy this set of stopwords, Destroy the array of words

      * and make sure all of the strings are also deleted

      *

      */

     ~StopWords();

    

     /*

      * method to add a word to the array

      *

      * Parameters:

      * word   - the word to insert

      *

      * Returns:

      * index to the newly inserted node

      *

      */

     int * Insert(string & word);

    

     /*

      * Function to check if the array is empty or not

      *

      * Returns:

      * true if the array is empty

      * false if the array is not empty

      *

      */

     bool IsEmpty() const

     {

       if( count <= 0 ) return true;

       else return false;

     }

    

     /*

      * Searches the array for the word, this algorithm

      * can do this in a binary search method knowing the

      * array is inserted in alphebetical ascending order

      * 

      * Parameters:

      * word   - The word (string) being searched for

      *

      * Returns:

      * true if the word is NOT in the list

      * false if the word IS in the list

      *

      */

     bool * Find(const string & word) const;

    

    

public: // Getters and Setters

     // Removed to save paper, these are all just simple one line functions

    

private:

};

 

#endif

 


Robot Filter

RobotFilter.h

/*

 *  RobotFilter.h

 *  webcrawler-proj

 *

 *  Created by Steven Hansen on 2/11/09.

 *  Copyright 2009 Spatical Company. All rights reserved.

 *

 */

 

#ifndef ROBOTFILTER_H

#define ROBOTFILTER_H

 

#include <iostream>

#include <fstream>

using namespace std;

 

#include <cstring>

#define SIZE 200

 

/*

 * simple robot filter program

 * this is my second iteration I made while studying

 * for the programming exam

 */

class RobotFilter

{

private:

     // array of char strings

     char * URLs[SIZE];

    

     // number of urls in this filter

     int URLcount;

 

public:

     /*

      * RobotFilter Constructor

      *

      */

     RobotFilter() : URLcount(0)

     {

       return;

     }

    

     /*

      * LoadExclusionFile -- Load URLs from file

      * Required Parameters:

      * input stream

      *

      */

     bool LoadExclusionFile(ifstream &file);

    

     /*

      * IsUrlPathExcluded -- Checks if given urlPath is excluded by robots.txt fle

      *

      * Parameters:

      * urlPath - the path to check

      *

      * Returns:

      *

      *

      */

     bool IsUrlPathExluded(const char * urlPath) const;

    

    

};

 

#endif


Distinguish between HTML and non-HTML links

HTMLFileFilter.h

/*