When I’m load testing a site, I like to get a list of urls to run against. There’s not much point in hitting the home page over and over, so let’s find some variety.
This script expects a sitemap or sitemap index, and will give you back a text file with urls.
There is a dependency on xmlstarlet, a command line program for dealing with XML files. If you’re using homebrew, it is simple to install xmlstarlet with `brew install xmlstarlet`.
When using xmlstarlet to query a sitemap, whose elements are namespaced, bind the namespace to a prefix and prepend that prefix to each element name, like this:
xmlstarlet sel -N x='http://www.sitemaps.org/schemas/sitemap/0.9'
Source: http://xmlstar.sourceforge.net/doc/UG/xmlstarlet-ug.html#idm47077139669232
#######################################
# Build a flat list of URLs from a sitemap index (or a single sitemap).
#
# Downloads the document at $1 and extracts every <loc> entry from it. Each
# entry is then downloaded in turn: entries served as XML are treated as
# child sitemaps and their <loc> values are written out; every other entry
# is written out as-is.
#
# Requires:  curl, xmlstarlet, sed, mktemp
# Arguments: $1 - URL of the sitemap index or sitemap
# Outputs:   progress messages on stdout, errors on stderr;
#            urls.txt in the current directory is truncated and refilled
# Returns:   0 on success
#            1 if the scratch directory cannot be created or $1 is not
#              served with an XML content type
#            2 if $1 is missing
#######################################
function get_urls_from_sitemap {
  local sitemap_index=${1:-}
  local output_file=urls.txt
  # The sitemap namespace; bound to the prefix "x" for the XPath queries.
  local xmlschema='http://www.sitemaps.org/schemas/sitemap/0.9'
  # Sitemaps are commonly served as either text/xml or application/xml.
  local xml_type_re='(text|application)/xml'
  local work_dir content_type sitemap_url
  local status=0

  if [[ -z $sitemap_index ]]; then
    echo "Usage: get_urls_from_sitemap <sitemap-url>" >&2
    return 2
  fi

  # Keep scratch files out of the caller's directory so that existing files
  # with the same names are never overwritten or deleted.
  work_dir=$(mktemp -d "${TMPDIR:-/tmp}/sitemap.XXXXXX") || return 1

  # Reset the output file
  : > "$output_file"

  # curl saves the body and prints only the response content type.
  content_type=$(curl -sS -o "$work_dir/index.xml" -w '%{content_type}' "$sitemap_index")

  # If it is an XML file, go with it; xmlstarlet will complain if it is not
  # actually a sitemap.
  if [[ $content_type =~ $xml_type_re ]]; then
    echo "Getting urls from index: $sitemap_index"

    # Sitemap XML escapes "&" as "&amp;"; undo that in case the extracted
    # text still carries the entity, otherwise the URLs cannot be fetched.
    xmlstarlet sel -N "x=$xmlschema" -t -v '//x:loc' -n \
      < "$work_dir/index.xml" \
      | sed 's/&amp;/\&/g' > "$work_dir/sitemaps.txt"

    # Read the list on fd 3 so commands in the loop body keep their stdin;
    # the descriptor is closed automatically when the loop ends.
    while IFS= read -r -u 3 sitemap_url; do
      [[ -n $sitemap_url ]] || continue

      content_type=$(curl -sS -o "$work_dir/sitemap.xml" -w '%{content_type}' "$sitemap_url")

      if [[ $content_type =~ $xml_type_re ]]; then
        # A child sitemap: pull the page URLs out of it.
        echo "Getting urls from sitemap: $sitemap_url"
        xmlstarlet sel -N "x=$xmlschema" -t -v '//x:loc' -n \
          < "$work_dir/sitemap.xml" \
          | sed 's/&amp;/\&/g' >> "$output_file"
      else
        # Not XML, so the entry itself is a page URL.
        printf '%s\n' "$sitemap_url" >> "$output_file"
      fi

      rm -f -- "$work_dir/sitemap.xml"
    done 3< "$work_dir/sitemaps.txt"
  else
    echo "Yo, this isn't an XML file" >&2
    status=1
  fi

  rm -rf -- "${work_dir:?}"
  return "$status"
}