How to query robots.txt

Query robots.txt using the robots.txt exclusion protocol implementation for Go language.

Golang application

Use the following application to display and query robots.txt.

package main

import (
	"flag"
	"fmt"
	"github.com/temoto/robotstxt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
)

func main() {
	// define status var and exit code
	var status string
	var exitCode int

	// define flags
	domain := flag.String("domain", "localhost", "domain name")
	path := flag.String("path", "/", "path")
	verbose := flag.Bool("verbose", true, "verbose mode")
	agent := flag.String("agent", "Google", "user agent to check")

	// define default message
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Check path against robots.txt for given domain\n")
		fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
		flag.PrintDefaults()
	}

	// parse command line parameters
	flag.Parse()

	// Get the contents of robots.txt
	response, err := http.Get("https://" + *domain + "/robots.txt")
	if err != nil {
		log.Fatal(err)
	}

	// Read robots.txt
	defer response.Body.Close()
	robotsContents, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log.Fatal(err)
	}

	// display robots.txt when verbose mode is defined
	if *verbose == true {
		fmt.Println("robots.txt contents:")
		fmt.Println("--------------------")

		fmt.Printf("%s", robotsContents)
		fmt.Println()

	}

	// Parse robots.txt
	robotsData, err := robotstxt.FromString(string(robotsContents))
	if err != nil {
		log.Fatal(err)
	}

	// Test the path against the User-Agent group
	if robotsData.TestAgent(*path, *agent) == true {
		status = "OK"
		exitCode = 0
	} else {
		status = "NOT OK"
		exitCode = 1
	}

	// display robots.txt when verbose mode is defined
	if *verbose == true {
    fmt.Println()
		fmt.Println("robots.txt check:")
		fmt.Println("--------------------")
		fmt.Fprintf(os.Stdout, "https://%s%s - %s\n", *domain, *path, status)

	}

	// exit, indicate result using exit code
	os.Exit(exitCode)
}

Usage

Display parameters.

$ robots --help
Check path against robots.txt for given domain
Usage of ./robots:
  -agent, string
        user agent to check (default "Google")
  -domain string
        domain name (default "localhost")
  -path string
        path (default "/")
  -verbose
        verbose mode (default true)

Verbose query.

$ robots -domain google.pl -agent AdsBot-Goole -path /maps/api/place/js/
robots.txt contents:
--------------------
User-agent: *
Disallow: /search
Allow: /search/about
Allow: /search/static
Allow: /search/howsearchworks
Disallow: /sdch
Disallow: /groups
Disallow: /index.html?
Disallow: /?
Allow: /?hl=
Disallow: /?hl=*&
Allow: /?hl=*&gws_rd=ssl$
Disallow: /?hl=*&*&gws_rd=ssl
Allow: /?gws_rd=ssl$
Allow: /?pt1=true$
Disallow: /imgres
Disallow: /u/
Disallow: /preferences
Disallow: /setprefs
Disallow: /default
Disallow: /m?
Disallow: /m/
Allow:    /m/finance
Disallow: /wml?
Disallow: /wml/?
Disallow: /wml/search?
Disallow: /xhtml?
Disallow: /xhtml/?
Disallow: /xhtml/search?
Disallow: /xml?
Disallow: /imode?
Disallow: /imode/?
Disallow: /imode/search?
Disallow: /jsky?
Disallow: /jsky/?
Disallow: /jsky/search?
Disallow: /pda?
Disallow: /pda/?
Disallow: /pda/search?
Disallow: /sprint_xhtml
Disallow: /sprint_wml
Disallow: /pqa
Disallow: /palm
Disallow: /gwt/
Disallow: /purchases
Disallow: /local?
Disallow: /local_url
Disallow: /shihui?
Disallow: /shihui/
Disallow: /products?
Disallow: /product_
Disallow: /products_
Disallow: /products;
Disallow: /print
Disallow: /books/
Disallow: /bkshp?*q=*
Disallow: /books?*q=*
Disallow: /books?*output=*
Disallow: /books?*pg=*
Disallow: /books?*jtp=*
Disallow: /books?*jscmd=*
Disallow: /books?*buy=*
Disallow: /books?*zoom=*
Allow: /books?*q=related:*
Allow: /books?*q=editions:*
Allow: /books?*q=subject:*
Allow: /books/about
Allow: /booksrightsholders
Allow: /books?*zoom=1*
Allow: /books?*zoom=5*
Allow: /books/content?*zoom=1*
Allow: /books/content?*zoom=5*
Disallow: /ebooks/
Disallow: /ebooks?*q=*
Disallow: /ebooks?*output=*
Disallow: /ebooks?*pg=*
Disallow: /ebooks?*jscmd=*
Disallow: /ebooks?*buy=*
Disallow: /ebooks?*zoom=*
Allow: /ebooks?*q=related:*
Allow: /ebooks?*q=editions:*
Allow: /ebooks?*q=subject:*
Allow: /ebooks?*zoom=1*
Allow: /ebooks?*zoom=5*
Disallow: /patents?
Disallow: /patents/download/
Disallow: /patents/pdf/
Disallow: /patents/related/
Disallow: /scholar
Disallow: /citations?
Allow: /citations?user=
Disallow: /citations?*cstart=
Allow: /citations?view_op=new_profile
Allow: /citations?view_op=top_venues
Allow: /scholar_share
Disallow: /s?
Allow: /maps?*output=classic*
Allow: /maps?*file=
Allow: /maps/api/js?
Allow: /maps/d/
Disallow: /maps?
Disallow: /mapstt?
Disallow: /mapslt?
Disallow: /maps/stk/
Disallow: /maps/br?
Disallow: /mapabcpoi?
Disallow: /maphp?
Disallow: /mapprint?
Disallow: /maps/api/js/
Disallow: /maps/api/place/js/
Disallow: /maps/api/staticmap?
Disallow: /maps/api/streetview?
Disallow: /maps/api/streetview/
Disallow: /maps/_/sw/manifest.json
Disallow: /mld?
Disallow: /staticmap?
Disallow: /maps/preview
Disallow: /maps/place
Disallow: /maps/timeline/
Disallow: /help/maps/streetview/partners/welcome/
Disallow: /help/maps/indoormaps/partners/
Disallow: /lochp?
Disallow: /center
Disallow: /ie?
Disallow: /blogsearch/
Disallow: /blogsearch_feeds
Disallow: /advanced_blog_search
Disallow: /uds/
Disallow: /chart?
Disallow: /transit?
Allow:    /calendar$
Allow:    /calendar/about/
Disallow: /calendar/
Disallow: /cl2/feeds/
Disallow: /cl2/ical/
Disallow: /coop/directory
Disallow: /coop/manage
Disallow: /trends?
Disallow: /trends/music?
Disallow: /trends/hottrends?
Disallow: /trends/viz?
Disallow: /trends/embed.js?
Disallow: /trends/fetchComponent?
Disallow: /trends/beta
Disallow: /trends/topics
Disallow: /musica
Disallow: /musicad
Disallow: /musicas
Disallow: /musicl
Disallow: /musics
Disallow: /musicsearch
Disallow: /musicsp
Disallow: /musiclp
Disallow: /urchin_test/
Disallow: /movies?
Disallow: /wapsearch?
Allow: /safebrowsing/diagnostic
Allow: /safebrowsing/report_badware/
Allow: /safebrowsing/report_error/
Allow: /safebrowsing/report_phish/
Disallow: /reviews/search?
Disallow: /orkut/albums
Disallow: /cbk
Allow: /cbk?output=tile&cb_client=maps_sv
Disallow: /maps/api/js/AuthenticationService.Authenticate
Disallow: /maps/api/js/QuotaService.RecordEvent
Disallow: /recharge/dashboard/car
Disallow: /recharge/dashboard/static/
Disallow: /profiles/me
Allow: /profiles
Disallow: /s2/profiles/me
Allow: /s2/profiles
Allow: /s2/oz
Allow: /s2/photos
Allow: /s2/search/social
Allow: /s2/static
Disallow: /s2
Disallow: /transconsole/portal/
Disallow: /gcc/
Disallow: /aclk
Disallow: /cse?
Disallow: /cse/home
Disallow: /cse/panel
Disallow: /cse/manage
Disallow: /tbproxy/
Disallow: /imesync/
Disallow: /shenghuo/search?
Disallow: /support/forum/search?
Disallow: /reviews/polls/
Disallow: /hosted/images/
Disallow: /ppob/?
Disallow: /ppob?
Disallow: /accounts/ClientLogin
Disallow: /accounts/ClientAuth
Disallow: /accounts/o8
Allow: /accounts/o8/id
Disallow: /topicsearch?q=
Disallow: /xfx7/
Disallow: /squared/api
Disallow: /squared/search
Disallow: /squared/table
Disallow: /qnasearch?
Disallow: /app/updates
Disallow: /sidewiki/entry/
Disallow: /quality_form?
Disallow: /labs/popgadget/search
Disallow: /buzz/post
Disallow: /compressiontest/
Disallow: /analytics/feeds/
Disallow: /analytics/partners/comments/
Disallow: /analytics/portal/
Disallow: /analytics/uploads/
Allow: /alerts/manage
Allow: /alerts/remove
Disallow: /alerts/
Allow: /alerts/$
Disallow: /ads/search?
Disallow: /ads/plan/action_plan?
Disallow: /ads/plan/api/
Disallow: /ads/hotels/partners
Disallow: /phone/compare/?
Disallow: /travel/clk
Disallow: /travel/hotelier/terms/
Disallow: /hotelfinder/rpc
Disallow: /hotels/rpc
Disallow: /commercesearch/services/
Disallow: /evaluation/
Disallow: /chrome/browser/mobile/tour
Disallow: /compare/*/apply*
Disallow: /forms/perks/
Disallow: /shopping/suppliers/search
Disallow: /ct/
Disallow: /edu/cs4hs/
Disallow: /trustedstores/s/
Disallow: /trustedstores/tm2
Disallow: /trustedstores/verify
Disallow: /adwords/proposal
Disallow: /shopping/product/
Disallow: /shopping/seller
Disallow: /shopping/ratings/account/metrics
Disallow: /shopping/reviewer
Disallow: /about/careers/applications/
Disallow: /landing/signout.html
Disallow: /webmasters/sitemaps/ping?
Disallow: /ping?
Disallow: /gallery/
Disallow: /landing/now/ontap/
Allow: /searchhistory/
Allow: /maps/reserve
Allow: /maps/reserve/partners
Disallow: /maps/reserve/api/
Disallow: /maps/reserve/search
Disallow: /maps/reserve/bookings
Disallow: /maps/reserve/settings
Disallow: /maps/reserve/manage
Disallow: /maps/reserve/payment
Disallow: /maps/reserve/receipt
Disallow: /maps/reserve/sellersignup
Disallow: /maps/reserve/payments
Disallow: /maps/reserve/feedback
Disallow: /maps/reserve/terms
Disallow: /maps/reserve/m/
Disallow: /maps/reserve/b/
Disallow: /maps/reserve/partner-dashboard
Disallow: /about/views/
Disallow: /intl/*/about/views/
Disallow: /local/dining/
Disallow: /local/place/products/
Disallow: /local/place/reviews/
Disallow: /local/place/rap/
Disallow: /local/tab/
Allow: /finance
Allow: /js/
Disallow: /finance?*q=*

# AdsBot
User-agent: AdsBot-Google
Allow: /maps/api/js?
Disallow: /maps/api/js/
Disallow: /maps/api/place/js/
Disallow: /maps/api/staticmap?
Disallow: /maps/api/streetview?
Disallow: /maps/api/streetview/

# Certain social media sites are whitelisted to allow crawlers to access page markup when links to google.com/imgres* are shared. To learn more, please contact [email protected]
User-agent: Twitterbot
Allow: /imgres

User-agent: facebookexternalhit
Allow: /imgres

Sitemap: https://www.google.com/sitemap.xml


robots.txt check:
--------------------
https://google.pl/maps/api/place/js/ - NOT OK

Silent query.

$ robots -domain linux.com -path /search/ --verbose=false
$ echo $?
1

Additional notes

Regular binary size.

$ GOOS=linux go build robots.go 
$ ls -lh robots
-rwxrwxr-x 1 milosz milosz 6.3M Jul 24 00:04 robots

Stripped binary size.

$ GOOS=linux go build -ldflags="-s -w" robots.go 
$ ls -lhs robots
4.1M -rwxrwxr-x 1 milosz milosz 4.1M Jul 24 00:17 robots