Categories
WebOps

How to query robots.txt

Query robots.txt using the robots.txt exclusion protocol implementation for the Go language.

Golang application

Use the following application to display and query robots.txt.

package main

import (
	"flag"
	"fmt"
	"github.com/temoto/robotstxt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
)

// main checks whether a URL path is allowed by a domain's robots.txt for a
// given user agent.
//
// It fetches https://<domain>/robots.txt, optionally prints the file and the
// verdict (verbose mode), and reports the result through the exit code:
// 0 when the path is allowed, 1 when it is disallowed. Fetch, read and parse
// failures are reported through log.Fatal, which also exits with status 1.
func main() {
	// define flags
	domain := flag.String("domain", "localhost", "domain name")
	path := flag.String("path", "/", "path")
	verbose := flag.Bool("verbose", true, "verbose mode")
	agent := flag.String("agent", "Google", "user agent to check")

	// define default message
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Check path against robots.txt for given domain\n")
		fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
		flag.PrintDefaults()
	}

	// parse command line parameters
	flag.Parse()

	// Get the contents of robots.txt
	response, err := http.Get("https://" + *domain + "/robots.txt")
	if err != nil {
		log.Fatal(err)
	}

	// Read robots.txt and close the body right away instead of deferring:
	// both log.Fatal and os.Exit terminate the process without running
	// deferred calls, so a deferred Close would never execute.
	robotsContents, err := ioutil.ReadAll(response.Body)
	// The close error is deliberately ignored; the body was only read from.
	_ = response.Body.Close()
	if err != nil {
		log.Fatal(err)
	}

	// display robots.txt when verbose mode is defined
	if *verbose {
		// A non-2xx answer means the body below is an error page rather than
		// a real robots.txt; say so on stderr to keep stdout format stable.
		if response.StatusCode < 200 || response.StatusCode > 299 {
			fmt.Fprintf(os.Stderr, "warning: robots.txt request returned HTTP status %s\n", response.Status)
		}

		fmt.Println("robots.txt contents:")
		fmt.Println("--------------------")

		fmt.Printf("%s", robotsContents)
		fmt.Println()
	}

	// Parse robots.txt. The HTTP status is passed along so that the library
	// applies its status-code rules and an error page returned for a missing
	// or failing robots.txt is not parsed as if it were the file itself.
	robotsData, err := robotstxt.FromStatusAndBytes(response.StatusCode, robotsContents)
	if err != nil {
		log.Fatal(err)
	}

	// Test the path against the User-Agent group
	status, exitCode := "OK", 0
	if !robotsData.TestAgent(*path, *agent) {
		status, exitCode = "NOT OK", 1
	}

	// display the check result when verbose mode is defined
	if *verbose {
		fmt.Println()
		fmt.Println("robots.txt check:")
		fmt.Println("--------------------")
		fmt.Printf("https://%s%s - %s\n", *domain, *path, status)
	}

	// exit, indicate result using exit code
	os.Exit(exitCode)
}

Usage

Display parameters.

$ robots --help
Check path against robots.txt for given domain
Usage of ./robots:
  -agent string
        user agent to check (default "Google")
  -domain string
        domain name (default "localhost")
  -path string
        path (default "/")
  -verbose
        verbose mode (default true)

Verbose query.

$ robots -domain google.pl -agent AdsBot-Google -path /maps/api/place/js/
robots.txt contents:
--------------------
User-agent: *
Disallow: /search
Allow: /search/about
Allow: /search/static
Allow: /search/howsearchworks
Disallow: /sdch
Disallow: /groups
Disallow: /index.html?
Disallow: /?
Allow: /?hl=
Disallow: /?hl=*&
Allow: /?hl=*&gws_rd=ssl$
Disallow: /?hl=*&*&gws_rd=ssl
Allow: /?gws_rd=ssl$
Allow: /?pt1=true$
Disallow: /imgres
Disallow: /u/
Disallow: /preferences
Disallow: /setprefs
Disallow: /default
Disallow: /m?
Disallow: /m/
Allow:    /m/finance
Disallow: /wml?
Disallow: /wml/?
Disallow: /wml/search?
Disallow: /xhtml?
Disallow: /xhtml/?
Disallow: /xhtml/search?
Disallow: /xml?
Disallow: /imode?
Disallow: /imode/?
Disallow: /imode/search?
Disallow: /jsky?
Disallow: /jsky/?
Disallow: /jsky/search?
Disallow: /pda?
Disallow: /pda/?
Disallow: /pda/search?
Disallow: /sprint_xhtml
Disallow: /sprint_wml
Disallow: /pqa
Disallow: /palm
Disallow: /gwt/
Disallow: /purchases
Disallow: /local?
Disallow: /local_url
Disallow: /shihui?
Disallow: /shihui/
Disallow: /products?
Disallow: /product_
Disallow: /products_
Disallow: /products;
Disallow: /print
Disallow: /books/
Disallow: /bkshp?*q=*
Disallow: /books?*q=*
Disallow: /books?*output=*
Disallow: /books?*pg=*
Disallow: /books?*jtp=*
Disallow: /books?*jscmd=*
Disallow: /books?*buy=*
Disallow: /books?*zoom=*
Allow: /books?*q=related:*
Allow: /books?*q=editions:*
Allow: /books?*q=subject:*
Allow: /books/about
Allow: /booksrightsholders
Allow: /books?*zoom=1*
Allow: /books?*zoom=5*
Allow: /books/content?*zoom=1*
Allow: /books/content?*zoom=5*
Disallow: /ebooks/
Disallow: /ebooks?*q=*
Disallow: /ebooks?*output=*
Disallow: /ebooks?*pg=*
Disallow: /ebooks?*jscmd=*
Disallow: /ebooks?*buy=*
Disallow: /ebooks?*zoom=*
Allow: /ebooks?*q=related:*
Allow: /ebooks?*q=editions:*
Allow: /ebooks?*q=subject:*
Allow: /ebooks?*zoom=1*
Allow: /ebooks?*zoom=5*
Disallow: /patents?
Disallow: /patents/download/
Disallow: /patents/pdf/
Disallow: /patents/related/
Disallow: /scholar
Disallow: /citations?
Allow: /citations?user=
Disallow: /citations?*cstart=
Allow: /citations?view_op=new_profile
Allow: /citations?view_op=top_venues
Allow: /scholar_share
Disallow: /s?
Allow: /maps?*output=classic*
Allow: /maps?*file=
Allow: /maps/api/js?
Allow: /maps/d/
Disallow: /maps?
Disallow: /mapstt?
Disallow: /mapslt?
Disallow: /maps/stk/
Disallow: /maps/br?
Disallow: /mapabcpoi?
Disallow: /maphp?
Disallow: /mapprint?
Disallow: /maps/api/js/
Disallow: /maps/api/place/js/
Disallow: /maps/api/staticmap?
Disallow: /maps/api/streetview?
Disallow: /maps/api/streetview/
Disallow: /maps/_/sw/manifest.json
Disallow: /mld?
Disallow: /staticmap?
Disallow: /maps/preview
Disallow: /maps/place
Disallow: /maps/timeline/
Disallow: /help/maps/streetview/partners/welcome/
Disallow: /help/maps/indoormaps/partners/
Disallow: /lochp?
Disallow: /center
Disallow: /ie?
Disallow: /blogsearch/
Disallow: /blogsearch_feeds
Disallow: /advanced_blog_search
Disallow: /uds/
Disallow: /chart?
Disallow: /transit?
Allow:    /calendar$
Allow:    /calendar/about/
Disallow: /calendar/
Disallow: /cl2/feeds/
Disallow: /cl2/ical/
Disallow: /coop/directory
Disallow: /coop/manage
Disallow: /trends?
Disallow: /trends/music?
Disallow: /trends/hottrends?
Disallow: /trends/viz?
Disallow: /trends/embed.js?
Disallow: /trends/fetchComponent?
Disallow: /trends/beta
Disallow: /trends/topics
Disallow: /musica
Disallow: /musicad
Disallow: /musicas
Disallow: /musicl
Disallow: /musics
Disallow: /musicsearch
Disallow: /musicsp
Disallow: /musiclp
Disallow: /urchin_test/
Disallow: /movies?
Disallow: /wapsearch?
Allow: /safebrowsing/diagnostic
Allow: /safebrowsing/report_badware/
Allow: /safebrowsing/report_error/
Allow: /safebrowsing/report_phish/
Disallow: /reviews/search?
Disallow: /orkut/albums
Disallow: /cbk
Allow: /cbk?output=tile&cb_client=maps_sv
Disallow: /maps/api/js/AuthenticationService.Authenticate
Disallow: /maps/api/js/QuotaService.RecordEvent
Disallow: /recharge/dashboard/car
Disallow: /recharge/dashboard/static/
Disallow: /profiles/me
Allow: /profiles
Disallow: /s2/profiles/me
Allow: /s2/profiles
Allow: /s2/oz
Allow: /s2/photos
Allow: /s2/search/social
Allow: /s2/static
Disallow: /s2
Disallow: /transconsole/portal/
Disallow: /gcc/
Disallow: /aclk
Disallow: /cse?
Disallow: /cse/home
Disallow: /cse/panel
Disallow: /cse/manage
Disallow: /tbproxy/
Disallow: /imesync/
Disallow: /shenghuo/search?
Disallow: /support/forum/search?
Disallow: /reviews/polls/
Disallow: /hosted/images/
Disallow: /ppob/?
Disallow: /ppob?
Disallow: /accounts/ClientLogin
Disallow: /accounts/ClientAuth
Disallow: /accounts/o8
Allow: /accounts/o8/id
Disallow: /topicsearch?q=
Disallow: /xfx7/
Disallow: /squared/api
Disallow: /squared/search
Disallow: /squared/table
Disallow: /qnasearch?
Disallow: /app/updates
Disallow: /sidewiki/entry/
Disallow: /quality_form?
Disallow: /labs/popgadget/search
Disallow: /buzz/post
Disallow: /compressiontest/
Disallow: /analytics/feeds/
Disallow: /analytics/partners/comments/
Disallow: /analytics/portal/
Disallow: /analytics/uploads/
Allow: /alerts/manage
Allow: /alerts/remove
Disallow: /alerts/
Allow: /alerts/$
Disallow: /ads/search?
Disallow: /ads/plan/action_plan?
Disallow: /ads/plan/api/
Disallow: /ads/hotels/partners
Disallow: /phone/compare/?
Disallow: /travel/clk
Disallow: /travel/hotelier/terms/
Disallow: /hotelfinder/rpc
Disallow: /hotels/rpc
Disallow: /commercesearch/services/
Disallow: /evaluation/
Disallow: /chrome/browser/mobile/tour
Disallow: /compare/*/apply*
Disallow: /forms/perks/
Disallow: /shopping/suppliers/search
Disallow: /ct/
Disallow: /edu/cs4hs/
Disallow: /trustedstores/s/
Disallow: /trustedstores/tm2
Disallow: /trustedstores/verify
Disallow: /adwords/proposal
Disallow: /shopping/product/
Disallow: /shopping/seller
Disallow: /shopping/ratings/account/metrics
Disallow: /shopping/reviewer
Disallow: /about/careers/applications/
Disallow: /landing/signout.html
Disallow: /webmasters/sitemaps/ping?
Disallow: /ping?
Disallow: /gallery/
Disallow: /landing/now/ontap/
Allow: /searchhistory/
Allow: /maps/reserve
Allow: /maps/reserve/partners
Disallow: /maps/reserve/api/
Disallow: /maps/reserve/search
Disallow: /maps/reserve/bookings
Disallow: /maps/reserve/settings
Disallow: /maps/reserve/manage
Disallow: /maps/reserve/payment
Disallow: /maps/reserve/receipt
Disallow: /maps/reserve/sellersignup
Disallow: /maps/reserve/payments
Disallow: /maps/reserve/feedback
Disallow: /maps/reserve/terms
Disallow: /maps/reserve/m/
Disallow: /maps/reserve/b/
Disallow: /maps/reserve/partner-dashboard
Disallow: /about/views/
Disallow: /intl/*/about/views/
Disallow: /local/dining/
Disallow: /local/place/products/
Disallow: /local/place/reviews/
Disallow: /local/place/rap/
Disallow: /local/tab/
Allow: /finance
Allow: /js/
Disallow: /finance?*q=*

# AdsBot
User-agent: AdsBot-Google
Allow: /maps/api/js?
Disallow: /maps/api/js/
Disallow: /maps/api/place/js/
Disallow: /maps/api/staticmap?
Disallow: /maps/api/streetview?
Disallow: /maps/api/streetview/

# Certain social media sites are whitelisted to allow crawlers to access page markup when links to google.com/imgres* are shared. To learn more, please contact images-robots-whitelist@google.com.
User-agent: Twitterbot
Allow: /imgres

User-agent: facebookexternalhit
Allow: /imgres

Sitemap: https://www.google.com/sitemap.xml


robots.txt check:
--------------------
https://google.pl/maps/api/place/js/ - NOT OK

Silent query.

$ robots -domain linux.com -path /search/ --verbose=false
$ echo $?
1

Additional notes

Regular binary size.

$ GOOS=linux go build robots.go
$ ls -lh robots
-rwxrwxr-x 1 milosz milosz 6.3M Jul 24 00:04 robots

Stripped binary size.

$ GOOS=linux go build -ldflags="-s -w" robots.go
$ ls -lhs robots
4.1M -rwxrwxr-x 1 milosz milosz 4.1M Jul 24 00:17 robots