Вы находитесь на странице: 1 из 54

<!DOCTYPE html>
<!-- The doctype must be one unbroken declaration; split across lines it is parsed as a bogus comment. -->
<html lang="en">
<!-- __ _ _ _ __| |_ (_)__ _____
/ _` | '_/ _| ' \| |\ V / -_)
\__,_|_| \__|_||_|_| \_/\___| -->
<head>
  <!-- Encoding declaration comes first so it precedes any text content in the head. -->
  <meta charset="UTF-8">
  <title>Full text of &quot;History of Internet Archive&quot;</title>

  <meta name="viewport" content="width=device-width, initial-scale=1.0"/>

  <meta name="google-site-verification" content="Q2YSouphkkgHkFNP7FgAkc4TmBs1Gmag3uGNndb53B8" />
  <meta name="google-site-verification" content="bpjKvUvsX0lxfmjg19TLblckWkDpnptZEYsBntApxUk" />

  <!-- Scripts load in this order; each src URL is kept on a single line (no line breaks inside the value). -->
  <script>window.archive_setup=[]</script>
  <script src="//archive.org/includes/jquery-1.10.2.min.js?v1.10.2" type="text/javascript"></script>
  <script src="//archive.org/includes/analytics.js?v=ea309d5" type="text/javascript"></script>

  <!-- Runs only when analytics.js has defined window.archive_analytics. -->
  <script>
    'use strict';
    if ('archive_analytics' in window) {
      archive_analytics.service = "ao_2";

      archive_analytics.send_pageview_on_load({
        mediaType: "texts",
      });

      archive_analytics.process_url_events(window.location);
    }
  </script>
  <script src="//archive.org/includes/bootstrap.min.js?v3.0.0" type="text/javascript"></script>
  <script src="//archive.org/components/setUpCopyableTexts/clipboard.js?v=ea309d5" type="text/javascript"></script>
  <script src="//archive.org/includes/build/js/polyfill.min.js?v=ea309d5" type="text/javascript"></script>
  <script src="//archive.org/includes/build/npm/react/umd/react.production.min.js?v16.7.0" type="text/javascript"></script>
  <script src="//archive.org/includes/build/npm/react-dom/umd/react-dom.production.min.js?vv16.7.0" type="text/javascript"></script>
  <script src="//archive.org/includes/build/js/archive.min.js?v=ea309d5" type="text/javascript"></script>
  <script src="//archive.org/includes/build/js/areact.min.js?v=ea309d5" type="text/javascript"></script>
  <link href="//archive.org/includes/build/css/archive.min.css?v=ea309d5" rel="stylesheet" type="text/css"/>
  <link rel="SHORTCUT ICON" href="https://archive.org/images/glogo.jpg"/>
</head>
<body class="navia ">
<a href="#maincontent" class="hidden-for-screen-readers">Skip to main
content</a>

<!-- Wraps all page content -->


<div id="wrap"
>
<div id="navwrap1">
<div id="navwrap2">

<div id="nav-tophat" class="collapse">


<div class="row toprow web" style="max-width:1000px;margin:auto;">
  <!-- Wayback Machine panel of the top-hat menu.
       Attribute names and values must stay on one line: a break inside "aria-hidden",
       a style declaration, or the onclick string changes how the markup is parsed. -->
  <div class="col-xs-12">
    <div class="wayback-txt">
      Search the history of over 371 billion
      <a style="display:inline" data-event-click-tracking="TopNav|WaybackMachineStatsLink" href="https://blog.archive.org/2016/10/23/defining-web-pages-web-sites-and-web-captures/">web pages</a> on the Internet.
    </div>
    <div class="roundbox7 wayback-main">
      <div class="row">
        <div class="col-sm-6" style="padding-left:0; padding-right:0;">
          <a style="padding-bottom:0" data-event-click-tracking="TopNav|WaybackMachineLogoLink" href="https://archive.org/web/"><img src="https://archive.org/images/WaybackLogoSmall.png" alt="Wayback Machine"/></a>
        </div>
        <div class="col-sm-6" style="padding-top:13px;">
          <form style="position:relative;">
            <span class="iconochive-search" aria-hidden="true"></span><span class="sr-only">search</span>
            <label for="nav-wb-url" class="sr-only">Search the Wayback Machine</label>
            <!-- On click: clears the inline padding-left and hides the search icon inside this form. -->
            <input id="nav-wb-url" class="form-control input-sm roundbox20" type="text" placeholder="enter URL or keywords" name="url" autocomplete="off" onclick="$(this).css('padding-left','').parent().find('.iconochive-search').hide()"/>
          </form>
        </div>
      </div><!--/.row-->
    </div><!--/.wayback-main-->
  </div>
</div><!--./row-->

<div class="row toprow fivecolumns texts">
  <!-- Top-hat menu row for the "texts" media type: featured links, two image widgets, top collections.
       Class names, data-* attribute names and CSS url() strings are kept unbroken on one line. -->

  <div class="col-sm-2 col-xs-7 col-sm-push-4">
    <div class="linx">
      <h5>Featured</h5>
      <a href="https://archive.org/details/texts" data-event-click-tracking="TopNav|FeaturedLink-AllTexts"><span class="iconochive-texts" aria-hidden="true"></span><span class="sr-only">texts</span> All Texts</a>
      <a href="https://archive.org/search.php?query=mediatype:texts&amp;sort=-publicdate" data-event-click-tracking="TopNav|FeaturedLink-ThisJustInTexts"><span class="iconochive-latest" aria-hidden="true"></span><span class="sr-only">latest</span> This Just In</a>
      <a href="https://archive.org/details/smithsonian" title="Smithsonian Libraries" data-event-click-tracking="TopNav|FeaturedLink-SmithsonianLibraries">Smithsonian Libraries</a>
      <a href="https://archive.org/details/fedlink" title="FEDLINK (US)" data-event-click-tracking="TopNav|FeaturedLink-FEDLINKUS">FEDLINK (US)</a>
      <a href="https://archive.org/details/genealogy" title="Genealogy" data-event-click-tracking="TopNav|FeaturedLink-Genealogy">Genealogy</a>
      <a href="https://archive.org/details/lincolncollection" title="Lincoln Collection" data-event-click-tracking="TopNav|FeaturedLink-LincolnCollection">Lincoln Collection</a>
      <a href="https://archive.org/details/additional_collections" title="Additional Collections" data-event-click-tracking="TopNav|FeaturedLink-AdditionalCollections">Additional Collections</a>
    </div>

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-5 col-sm-pull-2">
    <div class="widgets">
      <center class="items_list">
        <div class="items_list_img">
          <a href="https://archive.org/details/inlibrary?sort=-publicdate" style="background-image: url('https://archive.org/images/book-lend.png');" aria-hidden="true" data-event-click-tracking="TopNav|CircleWidget-BooksToBorrow"></a>
        </div>
        <a class="stealth boxy-label" data-event-click-tracking="TopNav|CircleWidget-BooksToBorrow" href="https://archive.org/details/inlibrary?sort=-publicdate">Books to Borrow</a>
      </center>
    </div><!--/.widgets-->

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-7 col-sm-push-2">
    <div class="linx ">
      <h5>Top</h5>

      <a href="https://archive.org/details/americana" title="American Libraries" data-event-click-tracking="TopNav|TopLink-AmericanLibraries">American Libraries</a>
      <a href="https://archive.org/details/toronto" title="Canadian Libraries" data-event-click-tracking="TopNav|TopLink-CanadianLibraries">Canadian Libraries</a>
      <a href="https://archive.org/details/universallibrary" title="Universal Library" data-event-click-tracking="TopNav|TopLink-UniversalLibrary">Universal Library</a>
      <a href="https://archive.org/details/opensource" title="Community Texts" data-event-click-tracking="TopNav|TopLink-CommunityTexts">Community Texts</a>
      <a href="https://archive.org/details/gutenberg" title="Project Gutenberg" data-event-click-tracking="TopNav|TopLink-ProjectGutenberg">Project Gutenberg</a>
      <a href="https://archive.org/details/biodiversity" title="Biodiversity Heritage Library" data-event-click-tracking="TopNav|TopLink-BiodiversityHeritageLibrary">Biodiversity Heritage Library</a>
      <a href="https://archive.org/details/iacl" title="Children's Library" data-event-click-tracking="TopNav|TopLink-ChildrenSLibrary">Children's Library</a>
    </div>

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-5 col-sm-pull-4">
    <div class="widgets">
      <center class="items_list">
        <div class="items_list_img">
          <a href="https://openlibrary.org" style="background-image: url('https://archive.org/images/widgetOL.png');" aria-hidden="true" data-event-click-tracking="TopNav|CircleWidget-OpenLibrary"></a>
        </div>
        <a class="stealth boxy-label" data-event-click-tracking="TopNav|CircleWidget-OpenLibrary" href="https://openlibrary.org">Open Library</a>
      </center>
    </div><!--/.widgets-->

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-7">
    <div class="linx linx-topped">

      <a href="https://archive.org/details/booksbylanguage" title="Books by Language" data-event-click-tracking="TopNav|TopLink-BooksByLanguage">Books by Language</a>
    </div>

  </div><!--/.col-sm-2-->

</div><!--/.row-->

<div class="row toprow fivecolumns movies">
  <!-- Top-hat menu row for the "movies" media type: featured links, two image widgets, top collections.
       Class names, data-* attribute names and attribute values are kept unbroken on one line. -->

  <div class="col-sm-2 col-xs-7 col-sm-push-4">
    <div class="linx">
      <h5>Featured</h5>
      <a href="https://archive.org/details/movies" data-event-click-tracking="TopNav|FeaturedLink-AllVideo"><span class="iconochive-movies" aria-hidden="true"></span><span class="sr-only">movies</span> All Video</a>
      <a href="https://archive.org/search.php?query=mediatype:movies&amp;sort=-publicdate" data-event-click-tracking="TopNav|FeaturedLink-ThisJustInVideo"><span class="iconochive-latest" aria-hidden="true"></span><span class="sr-only">latest</span> This Just In</a>
      <a href="https://archive.org/details/prelinger" title="Prelinger Archives" data-event-click-tracking="TopNav|FeaturedLink-PrelingerArchives">Prelinger Archives</a>
      <a href="https://archive.org/details/democracy_now_vid" title="Democracy Now!" data-event-click-tracking="TopNav|FeaturedLink-DemocracyNow">Democracy Now!</a>
      <a href="https://archive.org/details/occupywallstreet" title="Occupy Wall Street" data-event-click-tracking="TopNav|FeaturedLink-OccupyWallStreet">Occupy Wall Street</a>
      <a href="https://archive.org/details/nsa" title="TV NSA Clip Library" data-event-click-tracking="TopNav|FeaturedLink-TVNSAClipLibrary">TV NSA Clip Library</a>
    </div>

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-5 col-sm-pull-2">
    <div class="widgets">
      <center class="items_list">
        <div class="items_list_img">
          <a href="https://archive.org/details/tv" style="background-image: url('https://archive.org/services/img/tv');" aria-hidden="true" data-event-click-tracking="ItemList|ItemListLink"></a>
        </div>
        <a class="stealth boxy-label" data-event-click-tracking="ItemList|ItemListLink" href="https://archive.org/details/tv">TV News</a>
      </center>
    </div><!--/.widgets-->

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-7 col-sm-push-2">
    <div class="linx ">
      <h5>Top</h5>

      <a href="https://archive.org/details/animationandcartoons" title="Animation &amp; Cartoons" data-event-click-tracking="TopNav|TopLink-AnimationCartoons">Animation &amp; Cartoons</a>
      <a href="https://archive.org/details/artsandmusicvideos" title="Arts &amp; Music" data-event-click-tracking="TopNav|TopLink-ArtsMusic">Arts &amp; Music</a>
      <a href="https://archive.org/details/opensource_movies" title="Community Video" data-event-click-tracking="TopNav|TopLink-CommunityVideo">Community Video</a>
      <a href="https://archive.org/details/computersandtechvideos" title="Computers &amp; Technology" data-event-click-tracking="TopNav|TopLink-ComputersTechnology">Computers &amp; Technology</a>
      <a href="https://archive.org/details/culturalandacademicfilms" title="Cultural &amp; Academic Films" data-event-click-tracking="TopNav|TopLink-CulturalAcademicFilms">Cultural &amp; Academic Films</a>
      <a href="https://archive.org/details/ephemera" title="Ephemeral Films" data-event-click-tracking="TopNav|TopLink-EphemeralFilms">Ephemeral Films</a>
      <a href="https://archive.org/details/moviesandfilms" title="Movies" data-event-click-tracking="TopNav|TopLink-Movies">Movies</a>
    </div>

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-5 col-sm-pull-4">
    <div class="widgets">
      <center class="items_list">
        <div class="items_list_img">
          <a href="https://archive.org/details/911" style="background-image: url('https://archive.org/services/img/911');" aria-hidden="true" data-event-click-tracking="ItemList|ItemListLink"></a>
        </div>
        <a class="stealth boxy-label" data-event-click-tracking="ItemList|ItemListLink" href="https://archive.org/details/911">Understanding 9/11</a>
      </center>
    </div><!--/.widgets-->

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-7">
    <div class="linx linx-topped">

      <a href="https://archive.org/details/newsandpublicaffairs" title="News &amp; Public Affairs" data-event-click-tracking="TopNav|TopLink-NewsPublicAffairs">News &amp; Public Affairs</a>
      <a href="https://archive.org/details/spiritualityandreligion" title="Spirituality &amp; Religion" data-event-click-tracking="TopNav|TopLink-SpiritualityReligion">Spirituality &amp; Religion</a>
      <a href="https://archive.org/details/sports" title="Sports Videos" data-event-click-tracking="TopNav|TopLink-SportsVideos">Sports Videos</a>
      <a href="https://archive.org/details/television" title="Television" data-event-click-tracking="TopNav|TopLink-Television">Television</a>
      <a href="https://archive.org/details/gamevideos" title="Videogame Videos" data-event-click-tracking="TopNav|TopLink-VideogameVideos">Videogame Videos</a>
      <a href="https://archive.org/details/vlogs" title="Vlogs" data-event-click-tracking="TopNav|TopLink-Vlogs">Vlogs</a>
      <a href="https://archive.org/details/youth_media" title="Youth Media" data-event-click-tracking="TopNav|TopLink-YouthMedia">Youth Media</a>
    </div>

  </div><!--/.col-sm-2-->

</div><!--/.row-->

<div class="row toprow fivecolumns audio">
  <!-- Top-hat menu row for the "audio" media type: featured links, two image widgets, top collections.
       Class names, data-* attribute names and attribute values are kept unbroken on one line. -->

  <div class="col-sm-2 col-xs-7 col-sm-push-4">
    <div class="linx">
      <h5>Featured</h5>
      <a href="https://archive.org/details/audio" data-event-click-tracking="TopNav|FeaturedLink-AllAudio"><span class="iconochive-audio" aria-hidden="true"></span><span class="sr-only">audio</span> All Audio</a>
      <a href="https://archive.org/search.php?query=mediatype:audio&amp;sort=-publicdate" data-event-click-tracking="TopNav|FeaturedLink-ThisJustInAudio"><span class="iconochive-latest" aria-hidden="true"></span><span class="sr-only">latest</span> This Just In</a>
      <a href="https://archive.org/details/GratefulDead" title="Grateful Dead" data-event-click-tracking="TopNav|FeaturedLink-GratefulDead">Grateful Dead</a>
      <a href="https://archive.org/details/netlabels" title="Netlabels" data-event-click-tracking="TopNav|FeaturedLink-Netlabels">Netlabels</a>
      <a href="https://archive.org/details/oldtimeradio" title="Old Time Radio" data-event-click-tracking="TopNav|FeaturedLink-OldTimeRadio">Old Time Radio</a>
      <a href="https://archive.org/details/78rpm" title="78 RPMs and Cylinder Recordings" data-event-click-tracking="TopNav|FeaturedLink-78RPMsAndCylinderRecordings">78 RPMs and Cylinder Recordings</a>
    </div>

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-5 col-sm-pull-2">
    <div class="widgets">
      <center class="items_list">
        <div class="items_list_img">
          <a href="https://archive.org/details/etree" style="background-image: url('https://archive.org/services/img/etree');" aria-hidden="true" data-event-click-tracking="ItemList|ItemListLink"></a>
        </div>
        <a class="stealth boxy-label" data-event-click-tracking="ItemList|ItemListLink" href="https://archive.org/details/etree">Live Music Archive</a>
      </center>
    </div><!--/.widgets-->

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-7 col-sm-push-2">
    <div class="linx ">
      <h5>Top</h5>

      <a href="https://archive.org/details/audio_bookspoetry" title="Audio Books &amp; Poetry" data-event-click-tracking="TopNav|TopLink-AudioBooksPoetry">Audio Books &amp; Poetry</a>
      <a href="https://archive.org/details/opensource_audio" title="Community Audio" data-event-click-tracking="TopNav|TopLink-CommunityAudio">Community Audio</a>
      <a href="https://archive.org/details/audio_tech" title="Computers &amp; Technology" data-event-click-tracking="TopNav|TopLink-ComputersTechnology">Computers &amp; Technology</a>
      <a href="https://archive.org/details/audio_music" title="Music, Arts &amp; Culture" data-event-click-tracking="TopNav|TopLink-MusicArtsCulture">Music, Arts &amp; Culture</a>
      <a href="https://archive.org/details/audio_news" title="News &amp; Public Affairs" data-event-click-tracking="TopNav|TopLink-NewsPublicAffairs">News &amp; Public Affairs</a>
      <a href="https://archive.org/details/audio_foreign" title="Non-English Audio" data-event-click-tracking="TopNav|TopLink-NonEnglishAudio">Non-English Audio</a>
      <a href="https://archive.org/details/radioprograms" title="Radio Programs" data-event-click-tracking="TopNav|TopLink-RadioPrograms">Radio Programs</a>
    </div>

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-5 col-sm-pull-4">
    <div class="widgets">
      <center class="items_list">
        <div class="items_list_img">
          <a href="https://archive.org/details/librivoxaudio" style="background-image: url('https://archive.org/services/img/librivoxaudio');" aria-hidden="true" data-event-click-tracking="ItemList|ItemListLink"></a>
        </div>
        <a class="stealth boxy-label" data-event-click-tracking="ItemList|ItemListLink" href="https://archive.org/details/librivoxaudio">Librivox Free Audiobook</a>
      </center>
    </div><!--/.widgets-->

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-7">
    <div class="linx linx-topped">

      <a href="https://archive.org/details/audio_religion" title="Spirituality &amp; Religion" data-event-click-tracking="TopNav|TopLink-SpiritualityReligion">Spirituality &amp; Religion</a>
      <a href="https://archive.org/details/podcasts" title="Podcasts" data-event-click-tracking="TopNav|TopLink-Podcasts">Podcasts</a>
    </div>

  </div><!--/.col-sm-2-->

</div><!--/.row-->

<div class="row toprow fivecolumns software">
  <!-- Top-hat menu row for the "software" media type: featured links, two image widgets, top collections.
       Class names, data-* attribute names and attribute values are kept unbroken on one line. -->

  <div class="col-sm-2 col-xs-7 col-sm-push-4">
    <div class="linx">
      <h5>Featured</h5>
      <a href="https://archive.org/details/software" data-event-click-tracking="TopNav|FeaturedLink-AllSoftware"><span class="iconochive-software" aria-hidden="true"></span><span class="sr-only">software</span> All Software</a>
      <a href="https://archive.org/search.php?query=mediatype:software&amp;sort=-publicdate" data-event-click-tracking="TopNav|FeaturedLink-ThisJustInSoftware"><span class="iconochive-latest" aria-hidden="true"></span><span class="sr-only">latest</span> This Just In</a>
      <a href="https://archive.org/details/tosec" title="Old School Emulation" data-event-click-tracking="TopNav|FeaturedLink-OldSchoolEmulation">Old School Emulation</a>
      <a href="https://archive.org/details/softwarelibrary_msdos_games" title="MS-DOS Games" data-event-click-tracking="TopNav|FeaturedLink-MSDOSGames">MS-DOS Games</a>
      <a href="https://archive.org/details/historicalsoftware" title="Historical Software" data-event-click-tracking="TopNav|FeaturedLink-HistoricalSoftware">Historical Software</a>
      <a href="https://archive.org/details/classicpcgames" title="Classic PC Games" data-event-click-tracking="TopNav|FeaturedLink-ClassicPCGames">Classic PC Games</a>
      <a href="https://archive.org/details/softwarelibrary" title="Software Library" data-event-click-tracking="TopNav|FeaturedLink-SoftwareLibrary">Software Library</a>
    </div>

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-5 col-sm-pull-2">
    <div class="widgets">
      <center class="items_list">
        <div class="items_list_img">
          <a href="https://archive.org/details/internetarcade" style="background-image: url('https://archive.org/services/img/internetarcade');" aria-hidden="true" data-event-click-tracking="ItemList|ItemListLink"></a>
        </div>
        <a class="stealth boxy-label" data-event-click-tracking="ItemList|ItemListLink" href="https://archive.org/details/internetarcade">Internet Arcade</a>
      </center>
    </div><!--/.widgets-->

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-7 col-sm-push-2">
    <div class="linx ">
      <h5>Top</h5>

      <a href="https://archive.org/details/open_source_software" title="Community Software" data-event-click-tracking="TopNav|TopLink-CommunitySoftware">Community Software</a>
      <a href="https://archive.org/details/kodi_archive" title="Kodi Archive and Support File" data-event-click-tracking="TopNav|TopLink-KodiArchiveAndSupportFile">Kodi Archive and Support File</a>
      <a href="https://archive.org/details/softwarelibrary_msdos" title="MS-DOS" data-event-click-tracking="TopNav|TopLink-MSDOS">MS-DOS</a>
      <a href="https://archive.org/details/cd-roms" title="CD-ROM Software" data-event-click-tracking="TopNav|TopLink-CDROMSoftware">CD-ROM Software</a>
      <a href="https://archive.org/details/apkarchive" title="APK" data-event-click-tracking="TopNav|TopLink-APK">APK</a>
      <a href="https://archive.org/details/cdromsoftware" title="CD-ROM Software Library" data-event-click-tracking="TopNav|TopLink-CDROMSoftwareLibrary">CD-ROM Software Library</a>
      <a href="https://archive.org/details/vintagesoftware" title="Vintage Software" data-event-click-tracking="TopNav|TopLink-VintageSoftware">Vintage Software</a>
    </div>
  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-5 col-sm-pull-4">
    <div class="widgets">
      <center class="items_list">
        <div class="items_list_img">
          <a href="https://archive.org/details/consolelivingroom" style="background-image: url('https://archive.org/services/img/consolelivingroom');" aria-hidden="true" data-event-click-tracking="ItemList|ItemListLink"></a>
        </div>
        <a class="stealth boxy-label" data-event-click-tracking="ItemList|ItemListLink" href="https://archive.org/details/consolelivingroom">Console Living Room</a>
      </center>
    </div><!--/.widgets-->

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-7">
    <div class="linx linx-topped">

      <a href="https://archive.org/details/softwaresites" title="Software Sites" data-event-click-tracking="TopNav|TopLink-SoftwareSites">Software Sites</a>
      <a href="https://archive.org/details/tucows" title="Tucows Software Library" data-event-click-tracking="TopNav|TopLink-TucowsSoftwareLibrary">Tucows Software Library</a>
      <a href="https://archive.org/details/cdbbsarchive" title="Shareware CD-ROMs" data-event-click-tracking="TopNav|TopLink-SharewareCDROMs">Shareware CD-ROMs</a>
      <a href="https://archive.org/details/softwarelibrary_zx_spectrum" title="ZX Spectrum" data-event-click-tracking="TopNav|TopLink-ZXSpectrum">ZX Spectrum</a>
      <a href="https://archive.org/details/doom-cds" title="DOOM Level CD" data-event-click-tracking="TopNav|TopLink-DOOMLevelCD">DOOM Level CD</a>
      <a href="https://archive.org/details/zx_spectrum_library_games" title="ZX Spectrum Library: Games" data-event-click-tracking="TopNav|TopLink-ZXSpectrumLibraryGames">ZX Spectrum Library: Games</a>
      <a href="https://archive.org/details/cdromimages" title="CD-ROM Images" data-event-click-tracking="TopNav|TopLink-CDROMImages">CD-ROM Images</a>
    </div>

  </div><!--/.col-sm-2-->

</div><!--/.row-->

<div class="row toprow fivecolumns image">
  <!-- Top-hat menu row for the "image" media type: featured links, two image widgets, top collections.
       Class names, data-* attribute names and attribute values are kept unbroken on one line. -->

  <div class="col-sm-2 col-xs-7 col-sm-push-4">
    <div class="linx">
      <h5>Featured</h5>
      <a href="https://archive.org/details/image" data-event-click-tracking="TopNav|FeaturedLink-AllImage"><span class="iconochive-image" aria-hidden="true"></span><span class="sr-only">image</span> All Image</a>
      <a href="https://archive.org/search.php?query=mediatype:image&amp;sort=-publicdate" data-event-click-tracking="TopNav|FeaturedLink-ThisJustInImage"><span class="iconochive-latest" aria-hidden="true"></span><span class="sr-only">latest</span> This Just In</a>
      <a href="https://archive.org/details/flickrcommons" title="Flickr Commons" data-event-click-tracking="TopNav|FeaturedLink-FlickrCommons">Flickr Commons</a>
      <a href="https://archive.org/details/flickr-ows" title="Occupy Wall Street Flickr" data-event-click-tracking="TopNav|FeaturedLink-OccupyWallStreetFlickr">Occupy Wall Street Flickr</a>
      <a href="https://archive.org/details/coverartarchive" title="Cover Art" data-event-click-tracking="TopNav|FeaturedLink-CoverArt">Cover Art</a>
      <a href="https://archive.org/details/maps_usgs" title="USGS Maps" data-event-click-tracking="TopNav|FeaturedLink-USGSMaps">USGS Maps</a>
    </div>

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-5 col-sm-pull-2">
    <div class="widgets">
      <center class="items_list">
        <div class="items_list_img">
          <a href="https://archive.org/details/metropolitanmuseumofart-gallery" style="background-image: url('https://archive.org/services/img/metropolitanmuseumofart-gallery');" aria-hidden="true" data-event-click-tracking="ItemList|ItemListLink"></a>
        </div>
        <a class="stealth boxy-label" data-event-click-tracking="ItemList|ItemListLink" href="https://archive.org/details/metropolitanmuseumofart-gallery">Metropolitan Museum</a>
      </center>
    </div><!--/.widgets-->

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-7 col-sm-push-2">
    <div class="linx ">
      <h5>Top</h5>

      <a href="https://archive.org/details/nasa" data-event-click-tracking="TopNav|TopLink-NASAImages">NASA Images</a>
      <a href="https://archive.org/details/solarsystemcollection" data-event-click-tracking="TopNav|TopLink-SolarSystemCollection">Solar System Collection</a>
      <a href="https://archive.org/details/amesresearchcenterimagelibrary" data-event-click-tracking="TopNav|TopLink-AmesResearchCenter">Ames Research Center</a>
    </div>

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-5 col-sm-pull-4">
    <div class="widgets">
      <center class="items_list">
        <div class="items_list_img">
          <a href="https://archive.org/details/brooklynmuseum" style="background-image: url('https://archive.org/services/img/brooklynmuseum');" aria-hidden="true" data-event-click-tracking="ItemList|ItemListLink"></a>
        </div>
        <a class="stealth boxy-label" data-event-click-tracking="ItemList|ItemListLink" href="https://archive.org/details/brooklynmuseum">Brooklyn Museum</a>
      </center>
    </div><!--/.widgets-->

  </div><!--/.col-sm-2-->
  <div class="col-sm-2 col-xs-7">
    <div class="linx linx-topped">

    </div>

  </div><!--/.col-sm-2-->

</div><!--/.row-->
</div><!--/#nav-tophat-->

<div class="navbar navbar-inverse navbar-static-top" role="navigation">
  <!-- Main navigation bar: media-type tabs, home logo, mobile hamburger menu, search, upload, sign-in,
       followed by the "about" link list. Attribute names and values are kept unbroken on one line;
       the upload and sign-in links use target="_top" like the other links in this bar. -->

  <div id="nav-tophat-helper" class="hidden-xs"></div>
  <ul class="nav navbar-nav navbar-main">
    <li class="dropdown dropdown-ia pull-left">
      <a title="Web" class="navia-link web" data-top-kind="web" href="https://archive.org/web/" data-toggle="tooltip" target="_top" data-placement="bottom"><span class="iconochive-web" aria-hidden="true"></span><span class="sr-only">web</span></a>
    </li>
    <li class="dropdown dropdown-ia pull-left">
      <a title="Texts" class="navia-link texts" data-top-kind="texts" href="https://archive.org/details/texts" data-toggle="tooltip" target="_top" data-placement="bottom"><span class="iconochive-texts" aria-hidden="true"></span><span class="sr-only">texts</span></a>
    </li>
    <li class="dropdown dropdown-ia pull-left">
      <a title="Video" class="navia-link movies" data-top-kind="movies" href="https://archive.org/details/movies" data-toggle="tooltip" target="_top" data-placement="bottom"><span class="iconochive-movies" aria-hidden="true"></span><span class="sr-only">movies</span></a>
    </li>
    <li class="dropdown dropdown-ia pull-left">
      <a title="Audio" class="navia-link audio" data-top-kind="audio" href="https://archive.org/details/audio" data-toggle="tooltip" target="_top" data-placement="bottom"><span class="iconochive-audio" aria-hidden="true"></span><span class="sr-only">audio</span></a>
    </li>
    <li class="dropdown dropdown-ia pull-left">
      <a title="Software" class="navia-link software" data-top-kind="software" href="https://archive.org/details/software" data-toggle="tooltip" target="_top" data-placement="bottom"><span class="iconochive-software" aria-hidden="true"></span><span class="sr-only">software</span></a>
    </li>
    <li class="dropdown dropdown-ia pull-left rightmost">
      <a title="Image" class="navia-link image" data-top-kind="image" href="https://archive.org/details/image" data-toggle="tooltip" target="_top" data-placement="bottom"><span class="iconochive-image" aria-hidden="true"></span><span class="sr-only">image</span></a>
    </li>

    <li class="navbar-brand-li"><a class="navbar-brand" href="https://archive.org/" data-event-click-tracking="TopNav|HomeIcon" target="_top"><span class="iconochive-logo" aria-hidden="true"></span><span class="sr-only">logo</span></a></li>

    <!-- Hamburger menu; its classes hide it at the sm, md and lg breakpoints. -->
    <li class="nav-hamburger dropdown dropdown-ia pull-right hidden-sm hidden-md hidden-lg">
      <div class="container-fluid">
        <div class="navbar-header">
          <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#nav-hamburger-menu" aria-expanded="false">
            <span class="sr-only">Toggle navigation</span>
            <span class="icon-bar"></span>
            <span class="icon-bar"></span>
            <span class="icon-bar"></span>
          </button>
          <div class="navbar-collapse collapse" id="nav-hamburger-menu" aria-expanded="false">
            <ul class="nav navbar-nav">
              <li><a target="_top" data-event-click-tracking="TopNav|AboutLink" href="https://archive.org/about/">ABOUT</a></li>
              <li><a target="_top" data-event-click-tracking="TopNav|ContactLink" href="https://archive.org/about/contact.php">CONTACT</a></li>
              <li><a target="_top" data-event-click-tracking="TopNav|BlogLink" href="//blog.archive.org">BLOG</a></li>
              <li><a target="_top" data-event-click-tracking="TopNav|ProjectsLink" href="https://archive.org/projects">PROJECTS</a></li>
              <li><a target="_top" data-event-click-tracking="TopNav|HelpLink" href="https://archive.org/about/faqs.php">HELP</a></li>
              <li><a target="_top" data-event-click-tracking="TopNav|DonateLink" href="https://archive.org/donate">DONATE</a></li>
              <li><a target="_top" data-event-click-tracking="TopNav|JobsLink" href="https://archive.org/about/jobs.php">JOBS</a></li>
              <li><a target="_top" data-event-click-tracking="TopNav|VolunteerLink" href="https://archive.org/about/volunteerpositions.php">VOLUNTEER</a></li>
              <li><a target="_top" data-event-click-tracking="TopNav|PeopleLink" href="https://archive.org/about/bios.php">PEOPLE</a></li>
            </ul>
          </div><!-- /.navbar-collapse -->
        </div>
      </div><!-- /.container-fluid -->
    </li>

    <li id="nav-search" class="dropdown dropdown-ia pull-right">
      <!-- Clicking the icon submits the form inside #nav-search instead of following the href. -->
      <a href="https://archive.org/search.php" onclick="$(this).parents('#nav-search').find('form').submit(); return false" aria-hidden="true"><span class="iconochive-search" aria-hidden="true"></span><span class="sr-only">search</span></a>
      <div class="searchbar">
        <form class="search-form js-search-form" method="get" role="search" action="https://archive.org/searchresults.php" target="_top" data-event-form-tracking="TopNav|SearchForm" data-wayback-machine-search-url="https://web.archive.org/web/*/">
          <input id="search-bar-2" class="js-search-bar" placeholder="Search" type="text" name="search" value="" aria-controls="navbar_search_options" aria-label="Search the Archive. Filters and Advanced Search available below."/>

          <div id="navbar_search_options" class="search-options js-search-options" aria-expanded="false" aria-label="Search Options" data-keep-open-when-changed="false">
            <fieldset>
              <label>
                <input type="radio" name="sin" value="" checked>
                <span>Search metadata</span>
              </label>
              <label>
                <input type="radio" name="sin" value="TXT">
                <span>Search text contents</span>
              </label>
              <label>
                <input type="radio" name="sin" value="TV">
                <span>Search TV news captions</span>
              </label>
              <label>
                <input type="radio" name="sin" value="WEB">
                <span>Search archived web sites</span>
              </label>
            </fieldset>

            <a href="https://archive.org/advancedsearch.php" class="search-options__advanced-search-link" onclick="return AJS.advanced_search(this)">Advanced Search</a>
          </div>

          <input class="js-tvsearch" type="hidden" value='{"ands":[],"minday":"06/04/2009","maxday":"07/17/2019"}'/>

          <input type="submit" value="Search"/>

        </form>
      </div><!--/.searchbar -->
    </li>

    <li class="dropdown dropdown-ia pull-right">
      <a class="nav-upload" href="https://archive.org/create" target="_top" data-event-click-tracking="TopNav|UploadIcon"><span class="iconochive-upload" aria-hidden="true"></span><span class="sr-only">upload</span><span class="hidden-xs-span hidden-sm-span">UPLOAD</span></a>
    </li>

    <li class="dropdown-ia pull-right leftmost">
      <a class="nav-user" href="https://archive.org/account/login" style="padding-right:0" target="_top" data-event-click-tracking="TopNav|LoginIcon"><span class="iconochive-person" aria-hidden="true"></span><span class="sr-only">person</span><span class="hidden-xs-span">SIGN IN</span></a>
    </li>

  </ul>

  <ul id="nav-abouts" class="">
    <li><a target="_top" data-event-click-tracking="TopNav|AboutLink" href="https://archive.org/about/">ABOUT</a></li>
    <li><a target="_top" data-event-click-tracking="TopNav|ContactLink" href="https://archive.org/about/contact.php">CONTACT</a></li>
    <li><a target="_top" data-event-click-tracking="TopNav|BlogLink" href="//blog.archive.org">BLOG</a></li>
    <li><a target="_top" data-event-click-tracking="TopNav|ProjectsLink" href="https://archive.org/projects">PROJECTS</a></li>
    <li><a target="_top" data-event-click-tracking="TopNav|HelpLink" href="https://archive.org/about/faqs.php">HELP</a></li>
    <li><a target="_top" data-event-click-tracking="TopNav|DonateLink" href="https://archive.org/donate">DONATE</a></li>
    <li><a target="_top" data-event-click-tracking="TopNav|JobsLink" href="https://archive.org/about/jobs.php">JOBS</a></li>
    <li><a target="_top" data-event-click-tracking="TopNav|VolunteerLink" href="https://archive.org/about/volunteerpositions.php">VOLUNTEER</a></li>
    <li><a target="_top" data-event-click-tracking="TopNav|PeopleLink" href="https://archive.org/about/bios.php">PEOPLE</a></li>
  </ul>
</div><!--/.navbar-->
</div><!--#navwrap1-->
</div><!--#navwrap2-->

<!-- Begin page content -->


<main id="maincontent">
<div class="container container-ia">

<!-- Page title linking back to the item's details page, followed by a right-floated link to the same details page labelled "See other formats". -->
<h1>Full text of "<a href="/details/internetarchive-encyclis">History of Internet Archive</a>"</h1>
<h2 class="pull-right"><small><a href="/details/internetarchive-encyclis">See other formats</a></small></h2>
<!-- Line break carrying clear="right" before the text dump that follows. -->
<br class="clearfix" clear="right"/>
<pre>This article was downloaded by: [Hutchins, Carol]

On: 28 March 2011

Access details: Access Details: [subscription number 935712424]

Publisher Taylor &amp; Francis

Informa Ltd Registered in England and Wales Registered Number: 1072954 Registered
office: Mortimer House, 37-41 Mortimer Street, London W1T 3JH, UK


Cover not
available

Encyclopedia of Library and Information Sciences, Third Edition

Publication details, including instructions for authors and subscription


information:
http://www.informaworld.com/smpp/title~content=t917508581

Internet Archive

Marilyn Rackley ᵃ

ᵃ Library, Harvard University, Cambridge, Massachusetts, U.S.A.

Online publication date: 09 December 2009

To cite this Article Rackley, Marilyn (2010) 'Internet Archive', Encyclopedia of
Library and Information Sciences, Third
Edition, 1: 1, 2966 — 2976

PLEASE SCROLL DOWN FOR ARTICLE

Full terms and conditions of use:
http://www.informaworld.com/terms-and-conditions-of-access.pdf

This article may be used for research, teaching and private study purposes. Any
substantial or
systematic reproduction, re-distribution, re-selling, loan or sub-licensing,
systematic supply or
distribution in any form to anyone is expressly forbidden.

The publisher does not give any warranty express or implied or make any
representation that the contents
will be complete or accurate or up to date. The accuracy of any instructions,
formulae and drug doses
should be independently verified with primary sources. The publisher shall not be
liable for any loss,
actions, claims, proceedings, demand or costs or damages whatsoever or howsoever
caused arising directly
or indirectly in connection with or arising out of the use of this material.

Internet Archive
Marilyn Rackley

Library, Harvard University, Cambridge, Massachusetts, U.S.A.

Abstract

This entry describes the history of the Internet Archive from its founding in 1996
to its two billion page
crawl in 2007. It describes the key individuals and organizations involved in the
Archive's work and the
technological innovations that make the Archive possible, such as the ARC file
format, Heritrix, and
the Wayback Machine. The focus of this entry is primarily the Internet Archive's
Web archiving activities
and collections, but it also briefly discusses the Archive's other activities and
the impact it has had on the
fields of library and information science and the public in general.

INTRODUCTION

The Internet Archive (http://www.archive.org) has cap-
tured the imaginations of information professionals and
the Internet-using public alike. As a pioneer in the field of
Web archiving, it offers the largest and broadest collec-
tion of historic Web sites in the world and can lay claim
to a substantial collection of other digital cultural heri-
tage objects as well. Although often best known for its
Wayback Machine, a portal to its Web site archives, the
Archive and its cofounder and director, Brewster Kahle,
are active proponents of making all human knowledge
available to everyone in digital formats and of preserving
that knowledge for future generations; and the creation of
the Wayback Machine is only one aspect of their pursuit
of this goal.

Indeed, the Internet Archive has played both a direct


and an indirect role in preserving human knowledge. It
has played a direct role through its collecting activity and
technology development and it has played an indirect role
through its inspiration of similar collecting and preserva-
tion efforts. Moreover, the Archive collaborates with
dozens of other institutions engaged in similar pursuits
and promotes policies and practices that increase the free,
public availability of cultural heritage materials.

This entry describes the history of the organization,


including its mission and policies; the technology it uses
to achieve its mission; its collections and services; and the
impact it has had on the areas of Web archiving and
digital preservation in general.

HISTORY
Mission

The nonprofit Archive was founded in 1996 as a response
to a rapidly growing problem — the disappearance of
content from the World Wide Web. Estimates of the ave-
rage life span of a Web site vary, but the Internet
Archive's Frequently Asked Questions page gives 77
days as the figure. [1] As the Internet became increasingly
prominent in all areas of human activity, it became corre-
spondingly important to preserve it. Archive cofounder
Brewster Kahle likened the potential disappearance of
digital content from the Internet to other great cata-
strophic historical and cultural losses; "[m]anuscripts
from the library of Alexandria in ancient Egypt disap-
peared in a fire. The early printed books decayed into
unrecognizable shreds. Many of the oldest cinematic films
were recycled for their silver content. Unfortunately, his-
tory may repeat itself in the evolution of the Internet —
and its World Wide Web." [2]

To combat this problem, Kahle and Bruce Gilliat, two


technologists, established the Internet Archive in San
Francisco, California. They made access to and preserva-
tion of Web content the organization's primary mission,
and their vision for the organization was ambitious, aim-
ing at nothing short of an archive of the entire World
Wide Web. Because no other individual or organization
had previously attempted to archive Web content on such
a large scale, the Archive needed not only to create a vast
collection of this ephemeral content, but also to invent the
very means by which that content could be collected and
subsequently preserved.

In June of 2007, the state of California recognized the


Internet Archive as a library, making it eligible for Fed-
eral funding and symbolically allying it with institutions
that have a long tradition of preserving and providing
access to knowledge. The Internet Archive encourages
and promotes this association by, for example, calling user
accounts "library cards." The Archive shares the ideals
and aspirations of libraries and other similar organiza-
tions. Early in its history, the Library of Congress tapped
the organization to be its partner in creating collections
based on important events in U.S. history, such as the

2966

Encyclopedia of Library and Information Sciences, Third Edition DOI: 10.1081/E-ELIS3-120044284

Copyright © 2010 by Taylor &amp; Francis. All rights reserved.


Internet Archive

2967

2000 national elections and the terrorist attacks of


September 11, 2001, conferring prestige on the Archive
and signaling a shared agenda between the collaborators.
However, the Archive also sets itself apart from tradi-
tional libraries through its unique collections. Since its
creation, the Internet Archive has made "universal access
to all human knowledge" the foundation of all its activ-
ities. Because it is neither a collection of material selected
to serve the needs of a specific community nor a collec-
tion preserving the historical memory of a specific orga-
nization, it has been said that "the Internet Archive might
be described as a true archive, seeking to collect and
preserve the entire Web, past, present, and future." [3] In
the years since its foundation, it has grown from an orga-
nization that archives Web content to an organization that
promotes the free exchange of information in a variety of
forms and formats. It does so by making content available
online in digital formats and by developing tools and
services that help others to make content in their own
collections more widely available. Fig. 1 shows the home
page of the Internet Archive in late 2007. The content
highlighted on the home page gives an indication of the
variety of materials in the Archive's collections.

Founders and Significant Contributors

The individual most commonly associated with the Inter-
net Archive is Brewster Kahle (b. 1960). Kahle was one
of the founders and still leads the organization as its
director. He began his career in the technology industry;
after receiving a degree in Computer Science and Engi-
neering from Massachusetts Institute of Technology
(MIT) in 1982, Kahle's first position was at Thinking
Machines Corporation, a supercomputer manufacturer in
Massachusetts. At Thinking Machines, he participated in
the development of Wide Area Information Servers
(WAIS). Kahle left Thinking Machines to start WAIS,
Inc. in 1992, which AOL then purchased in 1995.

Since selling WAIS, Kahle has brought his background


in technology and business to bear on the issue of the role
of technology in society. Kahle characterizes the Internet
as a democratic communication medium and he directs
much of his time toward defending and promoting
what he views as an egalitarian medium. In 2005, the
American Academy of Arts and Sciences elected Kahle
as a fellow. The Academy and its members promote ser-
vice, scholarship, and public engagement in a wide range

&amp; E* £W* HgPV fcdfeivfe l«fc ttfe

fCl-

r4AS* Inuitfi nroncMariKtiU


BWkfScinrtiiiq M K* PubHitfrFun.*.

Brothers .ml S&lt;siers : fi.hiwlriy

Lrtllf Sls-i* umihtl lhi-1 he didnt h*H* so (I

•..iblmgs Bui when tol drtatAt lh_| lis: t\b\i

PI

HKHM!

l&lt;W

| T&lt;*eM»B_k |i_m«_i

Antwrrtrtui Ussrc «■ 1

_____

Th* HftrMt trttrrt _ buttflg _ dfrf*! _r*y M


kfertrl __-i _■_ Other ■___■_! _1jf__a it) t_p_i It
L*S ■ pipor *"*^. ■** (■*■■* *W KC9U »
AlUch &lt;A Ihc 50 Foot Wonwft - 1i&gt;-ii

Aver_g_r__ig l!5£x£z£r£!l

ijv Li*e fflllSiC

XT wmwi

Hal _«neig rt Rum Live al Rancho Hloalo


____

, S*pt*rn..i 07 . _G_8 Rancho Ni.iiio Niiiii..


CASH l 01 Swig A.Sw*y02 Ntk Kim &gt;
03 IjQhian ...

Brt Light brt -r 1 2 G*\ | »ie* or&gt; 2_0B-_^_S


Avt&lt;*g*r*to9 T^£rA£rA

Th_ D-Wivn Brethw- Lrw rt Sirl.v isJand


B&lt;ew-i^ Cawmiij on _-0&amp;-_-_7

__udi_&gt;

_ 3i l ,663 i_-_-_n?i

LF4B7Y_iBil-

WKh Yemaja, Juslm Rrivffs mod


expenmenlal tffarl Id dale, he ha. reached for
hoyDnd

i Aw*g*r*tw_ ft£r£r&amp;&amp;

__i n_a_iwj«ij

BtQ&lt;Hl* CtDL__C___D
The MMJMBS I _____&lt; ■_____!

Bi .- cloth Mi b«_id. blocked m gold and

Aw««gtrtbn» i_^IJ^£nSl

Clwratun awlMHrtici * dthwi _f th»

CMholn faith

Altt pMt bffU. a po*i li-y iwilrni lu 4 r

m.Matt Mural mt-m fan urnn-si m ( ■; m

F&gt; while w* anal ii

Pe "oie H-en&gt; mtdi a mas -againi! w f C -am

the fai rooms

lobBnewwdsmrlh

lObrHnwaidtmtth

t'iral«hjlDp^.J

Hi, rnmultt _qi&gt;


1 8 minutes apa
_Q rr.rr.i.-y - agd
33 minutes ago

Fig. 1 The Internet Archive home page (http://www.archive.org) on September 23,


2008.

2968

Internet Archive
of disciplines; and Kahle is an active public participant in
contemporary discussions of technology's impact on edu-
cation, the economy, culture, and the law. In one demon-
stration of his commitment to open access to information,
he has supported challenges to the extension of copyright
protection and to the copyright status of orphan works,
acting both through the Internet Archive and as an indi-
vidual. (The Archive filed an amicus brief in the case of
Eldred vs. Ashcroft, which challenged the constitutional-
ity of the Copyright Term Extension Act of 1998. Kahle,
with Richard Prelinger, also brought suit against the
United States to challenge the constitutionality of U.S.
copyright law. Both cases were heard by the Supreme
Court, and in both cases, the plaintiffs were unsuccessful
in loosening copyright restrictions.)

Bruce Gilliat founded the Internet Archive and Alexa


Internet with Brewster Kahle. Gilliat has remained at
Alexa and is currently the Chief Executive Officer.
Although Gilliat is not directly involved with the Archive's
activities, through Alexa Internet, he continues to work on
improved access to Web content and helps preserve it via
donations of content for the Archive's collections.

Another individual who has made important contribu-
tions to the Archive is the current president of the board
of directors, Rick Prelinger. Prelinger is now best known
for his archive of over 60,000 ephemeral films, a collec-
tion which he began in 1983, but he is also a filmmaker
and writer. The collection of educational, promotional,
amateur, and industrial films bearing his name is the larg-
est of its kind. In 2002, the Library of Congress acquired
the collection, but Prelinger continues to collect in the
area. Like Kahle, Prelinger is an enthusiastic advocate
for the preservation of and public access to cultural heri-
tage materials.

A key institutional partner in the Internet Archive's Web


archiving activity is Alexa Internet, a for-profit organization
that has provided the content for the Archive's Web collec-
tion since the Archive's inception. Alexa Internet was
founded in late 1996 to catalog the Web. The company
collects, aggregates, and analyses data on Web content and
Web usage. It gathers information on Web content through
its Web crawls, which run continuously, and it gathers
information on Web usage from the use of the Alexa Tool-
bar. The company has sold this information to other com-
panies, including other Internet application creators, such as
Netscape. Alexa Internet has been the defendant in more
than one lawsuit alleging violation of privacy laws in its
data collection and storage practices. Although Amazon
purchased the company in 1999, it continues to donate
copies of its Web crawls to the Internet Archive.
TECHNOLOGY

The work of the Internet Archive requires a specific set of


tools for the capture, storage, and provision of access to

its collections. Because many of these tools did not exist


in 1996 when the organization first began its work, it has
been responsible, in many cases, for designing and creat-
ing these tools itself, either on its own or with partners.

Capture Technology

The Internet Archive currently uses the open source


Heritrix Web crawler to find and capture Web sites. The
Archive began developing Heritrix in 2003, using the
Java programming language. Prior to 2003, the Archive
relied solely on Alexa Internet's proprietary crawler to
collect Web files. Heritrix's development has involved
the Archive, the International Internet Preservation Con-
sortium, and other partner libraries and institutions.

The basic requirements for the original crawler used by


Alexa Internet and the Internet Archive have remained
unchanged even while specific crawlers have progressed.
First, the crawler must be as polite as possible, obeying any
instructions on crawl behavior found in a site's robots.txt
file and avoiding aggressive crawl behavior that might dis-
rupt other site traffic. Second, the crawler must be able to
function on more than one machine at a time. Third, the
files captured from the Web must be aggregated into larger
files for ease of management and access. [4]

Beginning with specific Universal Resource Locators


(URLs) known as seeds, the crawler finds documents
(files) available over the Internet and downloads them to
the Archive's servers. The crawler parses the files looking
for references (links) to other files represented by URLs
and then adds those URLs to the list of files to retrieve. If
these reference paths are relative, the paths are made
absolute before being added to the list. As the crawler
retrieves each file in the list, it repeats this process of
searching for references to additional files and verifies
that the additional files have not already been captured.
This process is continuously operating to acquire multiple
snapshots of the Web as it grows and changes.

However, capturing the Web in snapshots, which take


place at varying intervals, cannot result in a complete
picture of the Web over time. Files and content added to
a Web page and taken off in between Alexa crawls of that
page will never be present in the archive. Because of the
method the crawler uses to find new documents to ret-
rieve, files to which other files frequently link will be
captured more often than files to which other files rarely
link. The number and timing of the Internet Archive snap-
shots available for any given file collected from the Inter-
net will vary and will often present only an incomplete
picture of any changes affecting that document over time.

As Heritrix and other crawlers have improved over the


years, their ability to capture larger and more complicated
Web sites has improved, as they become better able to
parse the references in each file. Future releases of Heri-
trix will continue to improve crawl results and will inc-
lude further duplication reduction, increased control for

Internet Archive

2969

curators, and advanced prioritization features. [5] Nonethe-
less, with the technology currently available, much of the
Internet remains hidden from Web crawling robots, and
thus outside of the Archive's collecting scope. Databases
cannot be captured by the Archive using Heritrix, nor can
it collect files that are password protected. In addition, Her-
itrix has difficulty parsing links found in many of the more
complicated Web programming technologies, such as Java-
Script and Flash, which are becoming increasingly popular
with Web designers. A user of the Internet Archive's Web
collection may find that only the home page of a site is
available in the archive because the site's navigation tools
were built in JavaScript, which the crawler was not able to
parse, and therefore the files making up the rest of the site
were not added to the crawler's list of documents to retrieve.

Storage and Preservation

In order to efficiently store the billions of files that make


up the Web pages in its collection, the Archive created
the Archive File Format (ARC), which uses the .arc ext-
ension. The requirements for this file format were that the
captured files would be self-identifying, that it could store
files retrieved via any type of network protocol; and that it
would allow the aggregation of multiple archive files into
larger files. [6] If the captured files were self-identifying, there
would be no need for separate index files to assist in the
retrieval of each individual file. If the file format could han-
dle files retrieved through different protocols, it could handle
any files available through the Internet, and not only those
using the hypertext transfer protocol used, for instance, on
the World Wide Web. Finally, if the system was not required
to handle billions of small files, it would be able to use its
storage and retrieval resources more efficiently.

Satisfying these requirements, each ARC file contains


multiple files retrieved during a crawl, each with a header
containing elements of metadata about the file and its
retrieval. These elements include the file name (its URL),
its size, its content type, the date and time of retrieval, and
the name of the organization that retrieved it. The Archive
stores the products of its Web crawls in ARC files holding
approximately 100 MB of data each. Because of the ease
with which archived files can be found within an ARC
file, the files needed to view an archived Web page or site
can be spread among multiple ARC files.

For the first 3 years of its existence, the Archive used


tape to store archived files. However, the tapes were un-
able to efficiently handle a large volume of access requests
and the need for a scalable storage solution became appar-
ent as the number of access requests increased. Conse-
quently, the Internet Archive designed the Petabox to
store and process large amounts of data. As its name indi-
cates, the Petabox can hold one petabyte of data, which is
the equivalent of a million gigabytes. To house its massive
archive, the organization needed inexpensive and efficient
storage that was easy to maintain and would support the

Archive's back-up and mirroring system. After develop-
ing the storage, the Internet Archive has let a third-party
reproduce it for use in many other organizations.

One of the challenges to the long-term viability of any


archive is the preservation of its collections. The Internet
Archive recognizes the need to maintain copies of its
collections in locations that are geographically separated
from each other. Mirror sites exist in Alexandria, Egypt,
and in Amsterdam, the Netherlands.

Access — The Wayback Machine

The Wayback Machine (http://www.archive.org/web/web.php)
is now almost synonymous with the Internet
Archive, but it was not until October 2001 that this simple
browser-based user interface was introduced as the pri-
mary way to access the Archive's Web collections.

The Wayback Machine interface allows users to find a


specific instance of a Web page. A user enters a URL into
the Wayback Machine search field and is "taken back" to
a results page listing the dates on which that page was
captured (see Fig. 2). The user then selects the version she
wishes to view by clicking on any available date.

Fig. 3 shows one of the earliest iterations of the


Archive's home page as seen in the Wayback Machine
before any collections were publicly available.

Fig. 4 shows how the home page has changed after a


few years of the organization's existence, with a sizeable
collection of archived Web sites freely available.

Additionally, the Wayback Machine allows users to


browse the Internet through space and time from any
given starting page. One of the most significant features
of the application is its ability to rewrite hyperlinks to
refer back to archived files as opposed to "live" files; links
are rewritten when a user requests a page from the
archives and not when the files are archived. With this
capability, the user is able to link from the archived ver-
sion of one page to the archived version of another page
through the page's original links, replicating the intercon-
nectedness of the "live" Web in the archive environment.

An example from the Internet Archive's home page will


serve to illustrate how the Wayback Machine works. The
URL for an August 15, 2000 version of the Internet
Archive's home page in the Wayback Machine is http://
web.archive.org/web/20000815054438/http://www.archive.
org/. In addition to an indication that the user is in the
domain of the Internet Archive's Web archive collection
(http://web.archive.org), this URL includes the original
URL for the resource being displayed (http://www.archive.
org), as well as the date and time of its capture (August 15,
2000, 5:44:38). If a user clicks on a link from this instance
of the home page to another page in the same site, such
as the "About" link, she will be taken to a page with
a URL of http://web.archive.org/web/20000815072836/
www.archive.org/about/index.html, that is to an archived
"About" page. The URL allows the user to know that they

2970

Internet Archive

Q - C X (.&gt; { ■ WrtJ-t.«!«..m*»*nWrf»*w&lt;m

"

S3*
&gt;**

layBaciii

fui

*- 1 Toke Mb Sack ■

^^ffH!,^^LB

Searched for (L^jMH&amp;fcftJHa

2047 Results

Note fifni duplienij ife not ihown, sj»_n

■ dawrtis nhsn s4i wis UEHJatid.

F4M*nal trprWlT bee»na* available bws &amp; month* atte. Mlkttun |

Search Results for Jan 01, 1996 - Mar 27, 2008

1996

1997

2 pages
Oct 11 1997 '

fut '1 1M7


Jan 13 V*»
Jan 33 t^ft

1999

10 pages

Jin 16 1999 '

jjajusaa

FtflP?,, TO

Frt.m 1MB

Fib 10 1999

m _b

2000
10 pages

Muca am •

Mir 03 3033
Mar CM ?««

Aj_e_m_-

May 20 arc
Jul. 10 3000

Ml. OB 1999

'VI 11 I"* '

».,?i jnrn

Jan 22 2000
Aua 15 7000 ■

2001
40 pages

Jan IB 2301 •
Jan 1° rnril
rjM.' JBl •
E_j___.
Flh24 2001 •
Fib 76 Ml
«l'.a',.«M

2002

140 pages
Jin 18 2002 •
Jan 34 :«i? •
rVhrs 300? ■

M"&gt;i ™? ■

AbfOl 2002 •
May 27 300? '

Sj_3__flO.
Ocll7 2OD0 ■
OtllB ?om
Qciii) arm
niffli.MiT

Dk 05. 2000

Pwoe ?ooo

*»oiam

Apr PA Tttll
MuM 2001

httJj:^f&gt;«b.l«^*fl'Qf*wW7D031KI5e75M5littc'y^«WfT.ifchs*.tfg^

Sip 02 2031
Sip OB Mil

Binns aim

Sin 06 2031
Sip 07 ?D3I
Sin OB 3ml

S""» ami
Sin 10 2031

~L&lt;1-C!:l
Stf 1,3, MM

S-n » ?mi
Sip 15 2031

sip is 3031

S" '" ™'

Sip 19 203)
W&lt;» 10 2001

Hw.'S.JBI

"" IB 70111

WW 28 2001
Mb- 29 3001

t_J0_S_
Ohuoi ma

.kin (14 ?m? *


AUj02 2O02 '
Au g, 03 200?
Aug IK 300?
Aug (17 2(103 *
AiialTB MM

fl _JBJ DOG

Ajig 10 3nfl?

*"° " tup


Aug I J 2002
Alia 13 Ml?

■*uan,an2

Aug 11 2mj

2003

37 pages
Jin 26 2003 •

Fiboa 3003 -
Ffboj , xm •
Fi-i. 17 ?rtn •

Fib 17 2003 •

ttK_fijafi •
Ma.?n an* •
«""' Tnrw

AH 19 2003 ■
Aa.?l 3003 '

m.v. ,aro ■

Ai.3li.3HW

Ml, 23 2003 '

2004
240 pages
Jan 03 2004
Fib 02 2004 •
Fib 10 30TH. -
Fib II mi •
Fib 20 2004 •
Ml. 19 3001 -

An. 01 TfTU &lt;

■Hi"" aim ■

.,-.■■_ Z_ '
'70 thiw

____&gt;j •
am OB. 2J_j *

MiyOS 3001
M| T 09 2004

May 14 2001

HojajUM

"■"^ ™l
Jan 03 2004 •
■limlW ?rm •
Jjl OS 20U *

■linim 7CIU

All] 16 2002 Jul 24 2003 ■ Jun 10 2004 '


Aug 30 300?

A"!?' TTTO

Aug 23 2002 •

Aug , 24 300?

*"°77 7TH7

Aug 28 2002

Aug ti 200?

Atlg 3.1 30 0?

Ran 01 3(iT?

Sip 02 2002

Sip 03 3003

sgim Ton? ■

San re amj

gf"7 ""

.Mas ?ora ■
.H7R Tom ■
Jul 3 1 2003 •

AiHjO? 3003 ■

Alias Torn ■
SmJi Trim ■

Pel 01 2003 •
On IS 3003 -
t&gt;aM Ma-
bj___2_B-
Ha* 26 2003 •
Ha. 25 3003 ■
Ow. 17 2003 ■
Dm 30 2003 '
Dm 26 2003

lim 13 3031 ■

Jul. 13.3004

Ml 11 TTH
ilin 1*i Tin

Jl" 1B IQ34
Jun 16 2004

^apiyi Tim?
q a n in 7i¥i7

ji,n I? , apa ■
■''" » ™&lt; '

Jun 20 2004 ■

iriB?? TfnfM *
■jura j^ , ?rti4 ■

Inn 71 ^fin^ '

2005

633 pages

Jin 01 2005 *
Jin 02 2005 -
■'in I P. Wfi
Jan 04 3005'

JIOJSJJJB-
Jan OS 3005 *
Jan IT) 3TIM ■
Jan 10 3ms «

JjnJ!L20JE&gt;
l.„U .../■ '

Jan 1? 3005 "


Jan 17 7tirfi •
Jan 13 2005 *
Jan 13 3005 '

Ja.iJ4.2We-

Jan 1&gt;i Tnffi ■

Jan 16 2005 •

Jan 17 3005 -
Jan 1? 3005 ■

Jan IB TTtFi •

Jan 19 2035

2006
413 pages

JinOI 20OE •

Jaw 01 3005 *

Jl« 02 2005 •
JanC? 3005 ■

■lanll? TTITfi '


J|»C3 2005 '
Jaw 03 3006 *
Jan,(U 300fi ■

Jin 04 2006 •
Jan 04 3005 ■
Janffi 30TB '
JjIlj35_200Ji •

Jan 30 2005 '


Jan 71 3005-

Jan 21 2005 •
Jan 23 3005 *
■ lap 33 .Trtn •
Jan n 7ITC •
JJUL2L20JG'
Jan 24 3005 •

Jan a. OT5 "

■linTH.jmW

Jan 27 3005 -
Jan 73 30M -
JanTn TniTi *
Jan 30 2005 •

Jan 31 3005 *
Jan tl AIB -

, F afai n .. Trirw

Jan 05 3006 '

Jan OS 3006 ■
JinOE 2006 •
JaftOS 3006 '

■laftfli TmB '


Jan 07 2005 ■
Jan07 3005 ■
JjuJJZjafi'
.Ia n n7 7mB ■
Jin 07 2006 •
Jin 07 3006 '

JmSLSSS&amp; "
JjnJJUBfi •
Jin OB 2006 '

2007

130 pages

Jan 02 2337 •

Jar.04 3007 •

jB.flv xta •

J«.07 70(17

;■' ujodz ■

Jan 11 X07 ■

Jar. 13 7(117 •
Jan 15 2007
Jin IS 7037 ■
J«Tl«,&lt;W
Jan 1R 70X17 •
J«l2l 2JJ7
Jan 35 3007 ■
Jan 7F, 3077 •
■Ian 77 70X17

EiUl 2PJE '

Fib 02 3O07-
Fib 03 3f«7
FibBS 3IW
FlhOO 2007
Fib 12 3007 ■

EitLU^SZ-

2008

10 pages
taJUODJ

Jin04 3003
Jan OS ?0fil
Jan 11 3HH •
Jan 17 2003
Jan IB 3009
Jan 10 3f1ffl
Jan 70 JTIH
Jan 21 2003 •
Jan 25 JOBS-
Fab 01 30TB
Fahffl TmB
Fib 15 2006
Fib?? ?00S

Frt.a,a»

Fabja 7(im
MM 07 2006

;■■■&gt;■. ■■.J-

Flb 19 2007
[ -n I'- ■.■:&lt;■!/ •

tsSUi-MZ'

Frt77 MO

Mil 01 2007
Ml, 04 3007

WarW.?M *

Ma, 06 2007-
Ml, 07 2O07 •

Ma, OS 3007 ■
M.«_il 200.7.
M«K MS!

la w in m *
, - .;- i: ■
Ma. 32 3007 •
Ma. 7» MTt ■

■ W» i. M..?xin7 . '

Fig. 2 The Wayback Machine results page for the URL http://archive.org (The
Internet Archive home page) on September 23, 2008
(http://web.archive.org/web/*/http://archive.org).

are not on the "live" Web because she is still in the web.
archive.org domain, that she is looking at a page that was
originally found at http://www.archive.org/about/index.
html, and that the page was captured on August 15, 2000
at 7:28:36. When the Archive's home page was displayed
in the Wayback Machine, the "About" link was rewritten
to point to the instance of the original link that is closest
in capture time to the instance of the home page being
displayed.

Similarly, if the user chooses to follow one of the links on


the "About" page to a page on another site, such as the link to
the UNESCO page about the new Library of Alexandria, she
would be taken to a page in the archive represented by the
URL http://web.archive.org/web/20000818203230/www.
unesco.org/webworld/alexandria_new/. The page dis-
played then would represent a different space and time
than the page from which the user first began navigating
in the Wayback Machine, because it was originally from
a different domain and was captured on August 18, 2000
at 20:32:30, approximately 3 days after the Archive's
home page. Thus the user is able to navigate through the
archived Web just as on the "live" Web, traveling from

one domain to another, but she is also able to navigate


through time by viewing files captured at different times.

The Internet Archive presents snapshots of files cap-
tured from the Internet as the Alexa crawlers follow avail-
able hyperlinks. This capture method prevents the Archive
from presenting a site, domain, or other group of files as
they existed at one time. Of the examples given above,
none of the pages were captured at exactly the same time.
Consequently, in this example, the user would be viewing
only an approximation of what the Internet Archive site
and the sites it linked to looked like on a specific date in
August 2000 at a specific time.

While the primary means of retrieving archived Web


sites through the Wayback Machine is by URL, the
Archive is developing more advanced search options for
the Web site archives, in particular a full-text search en-
gine. Currently, there is an advanced search interface for
the Web archive allowing users to limit their URL-based
search by date, and a structured search interface is avail-
able for other portions of the Archive's collections,
making possible searches by such fields as date, creator,
collection, and media type.

Internet Archive

2971

tft E* vpm h&amp;cy Be*"*** to* (Mi

mjm - C 1: El Htt ff ;/^^.af £ J™.«pJ»^M»^aMl?^3|l«B:/frww-»H*™-P^r

■ S3-

Tl ..W INTERNE

T ARC

H
V

i .1.1 . m

Building

a digital library for the future

1Mi0

AcknpwfctArrainili
Board Matiben

Fmding Llj

1:, IJ:- McW!

Webmasters

Our Mission

Internet Archive is
collecting and storing
public materials from the
Internet such as the World
Wide Web, Netnews, and
downloadable software
which have been donated
by Alexa Internet.

The Archive will provide


historians, researchers,
scholars, and others

access to this vast

collection of data (reaching


ten terabytes), and ensure
the longevity of this
information.
For more information
about our philosophy and
objectives, please read
Archiving the Internet by the
Archive's founder,
Brewster Kahle.

Vet) 1&gt;ie '96 PS


P: c n jtajtaj £,e rtiTn Wee,
Archive "Thus archive was
created m affiliation with
the Srajhs organ InsMuhon

T mVtxt h Miy

Fig. 3 The Internet Archive home page on December 11, 1997 as seen in the Wayback
Machine (http://web.archive.org/web/
19971211122953/http://www.archive.org/).

Like Heritrix, the Wayback Machine is an open source


application, created by the Internet Archive, but also used by
other institutions engaged in Web archiving activity. One
significant drawback to the Wayback Machine is the lack of
clues indicating to the user that she is viewing an archived
Web page. The only way to identify the instance being
presented is through the URL, and users must pay close
attention to the URLs as they browse through the collection
in order not to lose their way (or even in some cases, to find
themselves suddenly on the "live" Web). However, brows-
ing the historic Web through the Wayback Machine is only
one way of accessing the Archive's vast collection. Users
interested in applying data mining techniques to analyze
older Web pages are permitted to create their own applica-
tions for viewing and exploring the collection.

COLLECTION AND SERVICES


Web Pages

The Archive's first and most significant collection is its


collection of archived Web pages. The Alexa Internet

crawler continually crawls the Internet, capturing new


files and new versions of older files. When the Archive
accessions these files, it is collecting documentation of
the changes in individual Web sites and pages over time
and changes in the Web as a whole.
This collection has generally been characterized by its
breadth, rather than its depth, especially in its earliest
accessions. The characteristics of the Web collection are
the product of the Archive's philosophy regarding the
nature of the Web and the manner in which the material
was collected. According to Kahle, the Web as a whole is
"a self-documenting, self-cataloging machine"; the con-
tent of the Web includes within it its own description and
the means of its discovery. [7] Thus, the interconnected-
ness of all Web content necessitates an archiving strategy
that aims to collect and preserve as much content as
possible, without regard to the intrinsic merit of any given
site. This view of the Web does not consider each file,
page, or site as a separate entity to be specifically captured
in its entirety, so, for example, the Archive would not
point its crawler to a specific site, such as http://www.
archive.org, and attempt to gather all the files found in that
domain and not the files found on other domains.

2972

Internet Archive

Fig. 4 The Internet Archive home page on August 15, 2000 as seen in the Wayback
Machine (http://web.archive.org/web/
20000815054438/http://www.archive.org/).

Instead, the crawler would be instructed to start with


that URL as a seed and to follow all links found within
the files on that domain even if they pointed to other
domains. The crawler instructions might also specify a
limited number of files to capture from any one domain
or simply limit the amount of time spent retrieving docu-
ments. In this way, the crawler would collect files from as
many domains as possible without ever necessarily col-
lecting any domain or site in its entirety. The result would
be a representation of the many sites existing on the Web
at a given point in time, but would not be an exact replica
of every file available through the Internet at that time.

Another factor limiting the content of the archives is


the simple fact that the crawler must be aware that a site,
page, or file exists to be able to retrieve it. Alexa Internet's
source of information about the existence of these docu-
ments comes from individuals using the Alexa Toolbar
and from the links within documents already captured. If
the Alexa crawler has not been made aware of the exis-
tence of certain files, it does not crawl them and they will
not be added to the Archive's collection. Indeed, research
has suggested that the Alexa crawler's link discovery
method has resulted in Web pages hosted in certain

countries being considerably underrepresented in the


Archive's collections. [8]

The Archive's Web collection has changed over the


last decade as the technology needed to create it has
improved, but the essence of its collecting policy remains
the same. The organization aims to preserve as broad a
snapshot of the Internet as possible. It does, though, rec-
ognize that it potentially collects materials, especially
Web content, whose owners and copyright holders may
not want them included in the Archive. Regarding such
content, the Archive's policy is to remove the files from
the collection at the author's or publisher's request.
The Archive's crawlers also respect any instructions
(found in robots.txt files) from Web site owners not to
crawl their pages, and the Archive blocks future access to any such
pages captured before the instructions were added. The
Archive relies on its "polite" crawler and its "opt-out"
policy to protect copyright owners and does not actively
seek permission to collect and preserve the objects in its
collections.

The organization does not appraise the materials it


collects and does not undertake to censor any materials
that viewers may consider objectionable.

Internet Archive

2973

Although by the end of 2006, the Archive had already


collected over 85 billion pages, in June of 2007, it began a
two billion page crawl to create a comprehensive, global
snapshot of the Web at that time. This project was sup-
ported by a grant from the Mellon Foundation. Prior to
the crawl's commencement, cultural institutions from
around the world were able to submit their URLs for
inclusion. The "Around the World in Two Billion Pages"
collection was made publicly available at the end of 2007.

Despite the Archive's ambitious collecting goals, their


achievement has been limited by the capture technologies
and the associated methodologies the organization uses to
collect Web-based documents, as well as its own collect-
ing policies. Many documents simply cannot be identified
and crawled by the Alexa spider. The Web collection
does not include Internet-based databases, e-mail, discus-
sion forum postings, and other types of content. Other
documents that are identified and can be crawled are
archived only when and because other documents link to
them and not when their content is changed. Some docu-
ments will never be captured because of the file owner's
instructions to the Alexa crawler not to capture the
file, while still other documents might be retroactively
removed from the archive to respect the wishes of copy-
right owners. Consequently, the Archive can neither truly
reproduce an accurate and complete version of the Inter-
net as it existed at any given point in time, nor can it
provide even one version of all the documents available
on the Internet since the organization's collecting began.

Furthermore, while capture and replay technology con-


tinues to improve at a rapid pace, certain types of dynamic
Web page scripting, especially client-side scripting, such as
JavaScript, are difficult and sometimes impossible to prop-
erly capture and replay. Because of these constraints, ar-
chived Web pages and even entire archived sites may not
always accurately reflect the "look and feel" of the originals.

Archive offers to store and provide access to these files to


further its goal of promoting and, to the extent possible,
providing universal access to all knowledge.

Any user may add files to the collections through an


Internet-based interface. Unlike traditional archives, the
Internet Archive does not play an active role in choosing
accessions to its collections, nor does it make any claim to
guarantee the authenticity and reliability of these acces-
sions. It also does not attempt to control access to or use
of the materials.

Other Services

In 2005, the Internet Archive launched a service


"designed for institutions that have been mandated to
preserve content from the public Web but do not have
the IT infrastructure or technical staff necessary to meet
that mandate at the current time." [9] This service, named
Archive-It, currently has over 50 partners, including state
governments, universities, and nonprofit organizations.
For an annual fee, partners are able to use Archive-It's
Web-based application to create and manage their collec-
tions. The Internet Archive maintains the files making up
the collections and provides access to them through a
collection-specific version of the Wayback Machine.

The Internet Archive is also actively engaged in pro-


moting access to and use of its content, especially in
under-served communities. One notable example is its
bookmobiles. Launched initially in the United States in
2002, the Internet bookmobile traveled across the country
giving away small paperback children's books printed on
demand, using the Archive's collection of digitized pub-
lic-domain classics. The bookmobile was the foundation
for a separate project, Anywhere Books, which created
bookmobiles in Egypt, India, and Uganda. However, the
Internet bookmobile is not currently traveling.

Other Digital and Digitized Content

Since 1999, the collecting scope of the Archive has in-


cluded other forms of digital content, in addition to Web-
based digital content. The Archive now boasts of over
300,000 digitized texts, 200,000 audio files, 100,000 video
files, and 30,000 software-related materials. Like the Web
archives, the materials in the other collections cover a wide
variety of topics. While some materials are educational,
others are closer to pure entertainment. Indeed, one of the
most popular topics, if the Archive's users forum is any
indication, is the band The Grateful Dead.

Much of this material, though hosted at the Archive, has


actually come into the collections through donations from
individuals and other institutions, including, for instance,
the long-standing Project Gutenberg, which offers plain
text versions of public domain books and some audio
books, and the MIT's OpenCourseWare project, which
has contributed lecture videos to the Archive. The Internet

IMPACT

The pioneering efforts of the Internet Archive have been


hugely influential in the digital preservation and access
community, and its director Brewster Kahle is well-known
inside and outside this community for his enthusiastic
promotion of the Archive's goals. The organization's in-
fluence has been especially significant in the area of Web
archiving, inspiring and providing models for other pro-
jects and programs around the world, but it has also stimu-
lated debate about the preservation of digital objects and
its collections have already been used in legal discovery
and academic research.

Other Web and Digital Archives

The success of the Internet Archive has been the inspira-


tion for many other large-scale Web archiving programs,

2974

Internet Archive

and it has collaborated with other nonprofit organizations


and national institutions on some of those programs. In
addition to providing a model for Web archiving activity,
the Archive has provided a majority of the technology
currently being used by other institutions to create Web
archiving applications, programs, and collections around
the world.

In the United States, for instance, the open-source


crawler Heritrix forms the basis of the crawler used by
the Online Computer Library Center's (OCLC) Web
Archiving Workbench application, and the Archive's col-
laboration with the Library of Congress was crucial in
making possible the Library's MINERVA Web Archiving
Project. In Europe, the National Archives of the United
Kingdom uses the Archive to host its earliest Web site
collections dating back to 1996. The Royal Library and
the State and University Library of Denmark have also
developed a Web archiving application, the Netarchive-
Suite, around the Heritrix crawler.

Other efforts have benefited less directly from the


work of the Internet Archive, but undoubtedly have been
influenced by the Archive as the most prominent and old-
est example of Web archiving and a successful example
of the archiving of digital objects. For example, scientists
at the Los Alamos National Lab in New Mexico are using
the ARC file format as the basis for their aDORe Archive
storage of digital objects.

While building on the work of the Internet Archive,


many cultural heritage institutions have recognized the
limitations of the Archive's collections and have also
recognized a need for their own active participation in
the collection and preservation of important documents
and records falling within their collecting scopes. Those
organizations with a mandate to collect materials now
found on the Internet, whether those materials are organi-
zational records or informational documents, cannot rely
on the Internet Archive to completely and accurately ar-
chive these materials. Instead they must work to develop
collecting, preservation, and access solutions of their own
or work directly with third-party service providers to en-
sure the survival of these documents. Thus, for instance,
many state governments and universities in the United
States have begun Web archiving programs in order to
ensure that their own institutional Web-based records are
preserved for the future, and many libraries are develop-
ing thematically related collections of materials archived
from the Internet.

The International Internet Preservation


Consortium

The International Internet Preservation Consortium (IIPC)


was chartered in 2003, with the Internet Archive joining
with 11 national libraries to address the challenge of pre-
serving Internet content. The Internet Archive was the
only nongovernmental charter member. The Consortium
studies Web archiving practices and standards in the areas
of harvesting, preservation, and access. The IIPC is also at
work on a new file format for Web archiving, using ARC
as a starting point. Called WARC, this new format will
include a way to record actions taken to preserve the files.

The Open Content Alliance

The Open Content Alliance (OCA) was founded in 2005,


with Yahoo. The goal of the OCA is to provide free
access to global digital content. Although the Alliance
includes commercial organizations, like Yahoo, Micro-
soft, and Adobe Systems, its purpose is to provide an
alternative to commercial control of human knowledge.
The majority of the members are university and other
large research libraries, public, and private. Kahle envi-
sions an organization that will empower libraries to work
together to keep library content open to the public in
response to his perception of a trend toward increased
corporate control of that content. [10]

The primary focus of the Alliance's work is printed


books and analog audio and video, no longer under copy-
right protection, rather than the born-digital materials that
formed the foundation of the Internet Archive's collec-
tions. As part of the OCA, the Internet Archive offers to
digitize library materials for only a small per page or per
disc cost. The Alliance has not yet made any content
available, but the plans for inaugural collections include
materials from the University of California, the National
Archives of the United Kingdom, O'Reilly Media, and the
European Archive.

Copyright

When collecting materials found on the "live" Web, the


Internet Archive does not gain explicit permission from
their owners to copy and archive them. Instead it relies on
its "opt-out" policy as described above and the doctrine of
fair use, which has a well-established history in the world
of information in analog formats, but becomes more con-
tentious in the digital world, where it is often much easier
to copy and widely disseminate copyrighted materials.
The doctrine of fair use might allow the Internet Archive
to make a copy of a Web site for noncommercial uses, but
it is more questionable whether the doctrine still applies
when the Archive makes that same copy available to
millions of people via the Internet.

Thus far, the organization has not been involved in


much significant litigation regarding copyright issues.
Indeed the most significant lawsuit against the Archive to
date involves not whether it has the right to collect Web
pages but whether it failed to follow its own policy
respecting crawl instructions by copyright owners. [11] Its
policies and its status as a research and education resource
have somewhat reduced the legal risk inherent in its mis-
sion. The Archive's actions are certainly an important test
Internet Archive

2975

of the fair use doctrine in the realm of digital preserva-


tion, but it is still far from clear what the rights of archi-
vists, librarians, and other information professionals are
with respect to collecting, preserving, and making acces-
sible digital objects.

Court Cases

The Archive's Web collection has already been used to


find evidence in both civil and criminal court cases. The
Wayback Machine is becoming well-known among law-
yers, particularly those who handle trademark and domain
name litigation. Lawyers can use the Wayback Machine
to find examples of trademark infringement even if the
offender has already removed the trademarked material
from the "live" site. Although using archived or cached
Web pages as evidence is not unequivocally accepted in
the legal community, this type of evidence has gained
significant attention in the past several years for being of
considerable assistance in the evidence discovery process.
It is likely that there will be more questions about the
reliability and authenticity of this information, especially
when it is used in criminal cases. [12,13]

Scholarship

Researchers in the field of information and library science


have already begun to use the Internet Archive's collec-
tions to perform historical research on the Web and its
development. For example, in 2004, a paper on the acces-
sibility of government Web sites for people with disabil-
ities from 1997-2002 was presented at an Association
for Computing Machinery (ACM) conference. [14] The
authors used the Wayback Machine to view and test gov-
ernment Web sites from the past. Researchers at Cornell
University are building the Cornell Web Library, which
will use the collections of the Internet Archive but will
provide researchers more technologically sophisticated
ways of organizing and analyzing the collections without
requiring them to be sophisticated programmers. [15]

CONCLUSION

The Internet Archive has been an early and prominent


advocate for the preservation of the digital cultural heri-
tage of civilization. Even if its own collections and collect-
ing activity were insignificant, the role the organization,
its founders, and other employees and collaborators have
played in initiating and furthering public dialog about all
aspects of access to information in the digital age is
significant enough to earn the Internet Archive an impor-
tant place in the history of cultural institutions. Its influ-
ence has touched large and small cultural heritage
institutions in areas ranging from the very specific details
of digital preservation (by designing a new preservation

file format, for instance) to the very broad issue of access


to information in digital forms.

Some critics have questioned the long-term viability of


a cultural heritage institution founded by two technologists,
but cultural heritage institutions founded by dedicated phi-
lanthropists are not without successful precedent in the
analog world. Nonetheless, the organization will face many
new challenges in the future if it is to preserve its collec-
tions. Some of these challenges will be technical in nature
as the practical requirements of maintaining a large collec-
tion of digital files in a variety of formats change over time,
but other challenges will require policy-related resolutions.
Allegations of copyright violations have arisen, and given
the nature of the Archive's "opt-out" collecting policy,
there are likely to be more in the future.

Furthermore, the Archive may need to overcome lim-


itations in its collecting methodologies, policies, and tech-
nologies if it is to build a truly comprehensive collection.
The Internet Archive, like other organizations engaged in
archiving the Web and other forms of digital content,
faces four primary problems: the cultural problem (decid-
ing what to save), the technical problem (what technology
is necessary and viable), the economic problem (who will
pay for it), and the legal problem (who owns the material
and who can use it). [16] The Internet Archive has been
finding bold solutions to these problems for over a decade
in its quest to save the cultural heritage of civilization
from catastrophic loss.

REFERENCES

1. Internet Archive Frequently Asked Questions, http://www.


archive.org/about/faqs.php (accessed September 2007).

2. Kahle, B. Preserving the Internet. Sci. Am. 1997, 276 (3),


82-83.

3. Kahle, B.; Lyman, P. Archiving digital cultural artifacts.


D-Lib Mag. 1998, 4 (7), http://www.dlib.org/dlib/july98/
07lyman.html (accessed September 2007).

4. Burner, M. Crawling towards eternity: Building an archive


of the World Wide Web. Web Tech. Mag. 1997, 2 (5),
37-40, http://www.webtechniques.com/archives/1997/05/
burner/ (accessed September 2007).

5. Mohr, G. IA/IIPC open source tools update. In 7th Interna-


tional Web Archiving Workshop, Vancouver, Canada,
June 23, 2007, http://www.iwaw.net/07/mohr-iwaw07.pdf
(accessed September 2007).

6. Burner, M.; Kahle, B. Arc file format, http://www.archive.


org/web/researcher/ArcFileFormat.php (accessed September
2007).

7. Kahle, B. Editor's interview: The Internet Archive, an


interview with Brewster Kahle. RLG DigiNews 2002, 6
(3), http://digitalarchive.oclc.org/da/ViewObjectMain.jsp?
fileid=0000070519:000006287741&amp;reqid=3550#interview
(accessed September 2007).

8. Thelwall, M.; Vaughan, L. A fair history of the Web?


Examining country balance in the Internet Archive. Libr.
Inform. Sci. Res. 2004, 26, 162-176.

2976

Internet Archive

9. Archive-It Questions, http://www.archive-it.org/public/faq


(accessed September 2007).

10. Albanese, A.R. Scan this book! Libr. J. 2007, 132, 32-35.
August, http://www.libraryjournal.com/article/CA6466634.
html (accessed September 2007).

11. Zeller, T. Keeper of expired web pages is sued because


archive was used in another suit. New York Times. July
13, 2005, C9.

12. Kesmodel, D. Not fade away — Lawyers' delight: Old web


material doesn't disappear; Wayback Machine and Google
archive billions of pages, including deleted ones; Playboy
protests 'sex court.' Wall Street J. July 27, 2005, A1.

13. Fagan, M. 'Can you do a wayback on that?' The legal


community's use of cached web pages in and out of trial.
Boston Univ. J. Sci. Technol. Law 2007, 13(1), 46-73.

14. Hackett, S.; Parmanto, B.; Zeng, X. Accessibility of Inter-


net websites through time. In Proceedings of the 6th Interna-
tional ACM SIGACCESS Conference on Computers and
Accessibility, Atlanta, GA, October 18-20, 2004; 32-39.

15. Arms, W.; Aya, S.; Dmitriev, P.; Kot, B.; Mitchell, R.;
Walle, L. A research library based on the historical collec-
tions of the Internet Archive. D-Lib Mag. 2006, 12 (2),
http://www.dlib.org/dlib/february06/arms/02arms.html#2
(accessed September 2007).

16. Lyman, P. Archiving the World Wide Web. In Building a


National Strategy for Preservation: Issues in Digital Me-
dia Archiving; Council on Library and Information
Resources and the Library of Congress: Washington, DC,

2002, http://www.clir.org/pubs/reports/pub106/web.html
(accessed September 2007).

BIBLIOGRAPHY

Brown, A. Archiving Websites: A Guide for Information


Management Professionals; London: Facet, 2006.
Kahle, B. Editor's interview: The Internet Archive, an
interview with Brewster Kahle. RLG DigiNews 2002, 6
(3), http://www.rlg.org/preserv/diginews/diginews6-3.
html#interview.

Kahle, B. Preserving the Internet. Sci. Am. 1997, 276 (3),


82-83.

Kahle, B.; Lyman, P. Archiving digital cultural artifacts.


D-Lib Mag. 1998, 4 (7), http://www.dlib.org/dlib/july98/
07lyman.html.

Kimpton, M.; Ubois, J. Year-by-year: From an archive of


the Internet to an archive on the Internet. In Web Archiv-
ing; Masanes, J., Ed.; Springer: Berlin, 2006; 201-212.
Masanes, J., Ed. Web Archiving; Springer: Berlin, 2006.
Mohr, G.; Stack, M.; Ranitovic, I.; Avery, D.; Kimpton, M.
An Introduction to Heritrix, an Open Source Archival
Quality Web Crawler. In Presentation at the 4th Interna-
tional Web Archiving Workshop, Bath, September 16,
2004, http://www.iwaw.net/04/Mohr.pdf.

2.

3.

4.

5.
6.

7.

</pre> </div><!--/.container-->
</main>
</div><!--/#wrap-->

<!-- Timing ...


rendered on: www27.us.archive.org
seconds diff sec message stack(file:line:function)
=========================================================
0.0000 0.0000 petabox start
var/cache/petabox/petabox/www/sf/download.php:1:require
|common/ia:66:require_once
|setup.php:312:log
0.0057 0.0057 call get_redis()
var/cache/petabox/petabox/www/sf/download.php:86:main_wrap
|download.php:107:main
|download.php:328:getItem
|common/Item.inc:77:parseMetadata
|Item.inc:132:get_obj
|Metadata.inc:466:get_json_obj
|Metadata.inc:2176:log
0.0072 0.0015 redis_read start
var/cache/petabox/petabox/www/sf/download.php:86:main_wrap
|download.php:107:main
|download.php:328:getItem
|common/Item.inc:77:parseMetadata
|Item.inc:132:get_obj
|Metadata.inc:466:get_json_obj
|Metadata.inc:2230:fetch
|Metadata/RecordServer.inc:141:execute
|RecordServer/FetchRecordOp.inc:52:log
0.0366 0.0294 redis_read finish
var/cache/petabox/petabox/www/sf/download.php:86:main_wrap
|download.php:107:main
|download.php:328:getItem
|common/Item.inc:77:parseMetadata
|Item.inc:132:get_obj
|Metadata.inc:466:get_json_obj
|Metadata.inc:2230:fetch
|Metadata/RecordServer.inc:141:execute
|RecordServer/FetchRecordOp.inc:58:log
0.0453 0.0087 begin session_start
var/cache/petabox/petabox/www/sf/download.php:86:main_wrap
|download.php:107:main
|download.php:545:stream
|download.php:957:head
|common/Nav.inc:127:__construct
|Nav.inc:196:session_start
|Cookies.inc:56:log
0.0454 0.0001 done session_start
var/cache/petabox/petabox/www/sf/download.php:86:main_wrap
|download.php:107:main
|download.php:545:stream
|download.php:957:head
|common/Nav.inc:127:__construct
|Nav.inc:196:session_start
|Cookies.inc:62:log
0.2127 0.1673 bug dump
var/cache/petabox/petabox/www/sf/download.php:86:main_wrap
|download.php:107:main
|download.php:545:stream
|download.php:987:footer
|common/setup.php:158:footer
|Nav.inc:1540:dump
|Bug.inc:102:log
-->
<script type="text/javascript">
// Footer analytics hook. Runs only when the analytics library has attached
// itself to `window`; otherwise the whole block is a no-op.
if (window.archive_analytics) {
  // NOTE(review): `vs` and `i` are deliberately left as top-level `var`s
  // (and therefore globals) so this script's observable side effects match
  // the original; confirm nothing else reads them before scoping them.
  var vs = window.archive_analytics.get_data_packets();

  // Stamp every packet returned by get_data_packets() with a random
  // cache-busting value and the server-side render metadata for this page.
  // NOTE(review): for...in is kept because the container type returned by
  // get_data_packets() is not visible here (array vs. plain object).
  for (var i in vs) {
    vs[i]['cache_bust'] = Math.random();
    vs[i]['server_ms'] = 212;
    vs[i]['server_name'] = "www27.us.archive.org";
  }

  // Use `.length` rather than `.size()`: `.size()` is deprecated since
  // jQuery 1.8 and removed in 3.0, while `.length` yields the same count on
  // every jQuery version, including the 1.10.2 build this page loads.
  if ($(".more_search").length > 0) {
    window.archive_analytics.send_scroll_fetch_base_event();
  }
}
</script>
</div>
</body>
</html>

Вам также может понравиться