JENS MALMGREN I create, that is my hobby.

Porting my blog for the second time, finished loading old blog data

This is post #17 of my series about how I port this blog from Blogengine.NET 2.5 ASPX on a Windows Server 2003 to a Linux Ubuntu server, Apache2, MySQL and PHP. A so called LAMP. The introduction to this project can be found in this blog post /post/Porting-my-blog-for-the-second-time-Project-can-start.

Now when I finished loading the data I would like to celebrate this by posting my entire complete perl program reading the XML and loading it into the database.

#!/usr/bin/perl
use strict;
use warnings;
use XML::LibXML; # apt-get install libxml-libxml-perl
use DBI; # apt-get install libdbd-mysql-perl
use Switch; #apt-get install libswitch-perl
use URI::Encode qw(uri_encode uri_decode); #apt-get install liburi-encode-perl
use LWP::Simple;
use HTML::Entities # apt-get install libhtml-parser-perl

binmode STDIN, ':encoding(UTF-8)';

# Database connection
my $dbh = DBI-≻connect('dbi:mysql:database=jensblog;#host=localhost','randomuser','somepassword',{AutoCommit=≻1,RaiseError=≻1,PrintError=≻1});
$dbh-≻do( "set names utf8" );

my $deleteDataBase = 1;
my $insertCategories = 1;
my $insertPost = 1;
my $insertPostCategories = 1;
my $loadFeedback = 1;
my $loadTags = 1;
my $downloadImages = 1;

# Delete data from the tables
# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-5.aspx
# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-tags.aspx
if ($deleteDataBase)
{
	$dbh-≻do('delete from PostTag');
	$dbh-≻do('delete from PostCategory');
	$dbh-≻do('delete from PostURL');
	$dbh-≻do('delete from URL');
	$dbh-≻do('delete from Tag');
	$dbh-≻do('delete from Category');
	$dbh-≻do('delete from Feedback');
	$dbh-≻do('delete from Post');
	
	$dbh-≻do('ALTER TABLE Post AUTO_INCREMENT = 1');
	$dbh-≻do('ALTER TABLE PostTag AUTO_INCREMENT = 1');
	$dbh-≻do('ALTER TABLE PostURL AUTO_INCREMENT = 1');
	$dbh-≻do('ALTER TABLE PostCategory AUTO_INCREMENT = 1');
	$dbh-≻do('ALTER TABLE URL AUTO_INCREMENT = 1');
	$dbh-≻do('ALTER TABLE Tag AUTO_INCREMENT = 1');
	$dbh-≻do('ALTER TABLE Category AUTO_INCREMENT = 1');
	$dbh-≻do('ALTER TABLE Feedback AUTO_INCREMENT = 1');
}

# For detection of duplicate slug names.
# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-4.aspx
my %dictSlugToID = ();

# Load categories
# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data.aspx
my $filename = "/usr/local/bin/jensblog/App_Data/categories.xml";
my $parser = XML::LibXML-≻new("1.0", "UTF-8");
my $xmldoc = $parser-≻parse_file($filename);
my %dictGUIDToID = ();
my %dictTargetAndTagNameToURLID = ();

if ($insertCategories)
{
	for my $category_node ($xmldoc-≻findnodes("//category"))
	{
		my $strGuidId = $category_node -≻ findnodes('@id') -≻ to_literal;
		my $strCategoryName = $category_node-≻textContent;
		$dbh-≻do('INSERT INTO Category (Name) VALUES (?)', undef, $strCategoryName);
		my $intId = $dbh-≻{mysql_insertid};
		$dictGUIDToID{ $strGuidId } = $intId;
		print $strGuidId . " " . $strCategoryName. " " . $intId . "
";
	}
}


Walk('/usr/local/bin/jensblog/App_Data/posts');
# LoadPost('/usr/local/bin/jensblog/App_Data/posts/afe00c46-b999-4305-8ca5-7d7f5ce9ab23.xml');
# LoadPost('/usr/local/bin/jensblog/App_Data/posts/302eaa17-dca5-427f-af96-c39bdb428c41.xml');
# LoadPost('/usr/local/bin/jensblog/App_Data/posts/8d4c8839-f196-480c-a1d7-7f0fb53b9e2f.xml');

# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-2.aspx
# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data.aspx
sub Walk
{
	my ($directory) = @_;
	
	opendir my $dirHandle, $directory or die "Failed to open $directory: $!";
	my @dirItems = readdir $dirHandle;
	
	for my $dirItem (@dirItems)
	{
		if ($dirItem =~ /^(.|..)$/)
		{
			next;
		}
		if ($dirItem =~ /_vti_cnf$|themes$/)
		{
			next;
		}
		
		# Skip this file. The welcome message of the BlogEngine.
		# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-4.aspx
		if ($dirItem =~ /c3b491e5-59ac-4f6a-81e5-27e971b903ed.xml/)
		{
			next;
		}
		
		my $path = $directory . "/" . $dirItem;
		
		if (-f $path && $path =~ /.xml$/i)
		{
			LoadPost($path);
		}
		if (-d $path)
		{
			Walk($path);
		}
	}
} # Walk()

# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-2.aspx
sub LoadPost
{
	my ($filenamePost) = @_;
	print "===============================";
	print "========== LOAD POST ==========";
	print "===============================";
	print "File: " . $filenamePost . "";
	# Load a post fields
	# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-load-a-post.aspx
	#my $filenamePost = "/usr/local/bin/jensblog/App_Data/posts/6d781b81-27ab-4ecf-a4ce-df2795128f0d.xml";
	$xmldoc = $parser-≻parse_file($filenamePost);

	my $strEncoding = $xmldoc-≻encoding();

	print "Encoding: " . $strEncoding . "";

	my %dictStringFieldToValue = ();
	my %dictBoolFieldToValue = ();

	my $postID = 0;
	if ($insertPost)
	{
		for my $node ($xmldoc-≻findnodes("/post/*"))
		{
			if ($node -≻ nodeName() =~ /author|title|description|slug|content|pubDate|lastModified/)
			{
				$dictStringFieldToValue{$node -≻ nodeName()} = $node-≻textContent();
			}
			elsif ($node -≻ nodeName() =~ /ispublished|isdeleted|iscommentsenabled/)
			{
				my $boolValue = 0;
				if ($node-≻textContent() eq "True")
				{
					$boolValue = 1;
				}
				$dictBoolFieldToValue{$node -≻ nodeName()} = $boolValue;
			}
		}

		# Clean up the slug. Resolve duplicate slug names.
		# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-4.aspx
		$dictStringFieldToValue{"slug"} =~ s/^-?(.*)-?$/$1/;
		$dictStringFieldToValue{"title"} =~ s/^s?(.*)s?$/$1/;
		my $iSlugCounter = 2;
		while (exists ($dictSlugToID{$dictStringFieldToValue{"slug"}}))
		{
			$dictStringFieldToValue{"slug"} = $dictStringFieldToValue{"slug"} . "-" . $iSlugCounter;
			$iSlugCounter++;
		}
		print "Slug: " . $dictStringFieldToValue{"slug"} . "";
		
		$dbh-≻do(
			'INSERT INTO Post (Author,Title,Description,Slug,Content,' .
			'IsPublished,IsDeleted,IsCommentsEnabled,' .
			'PublishedOn,ModifiedOn) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', undef,
			$dictStringFieldToValue{"author"},
			$dictStringFieldToValue{"title"},
			$dictStringFieldToValue{"description"},
			$dictStringFieldToValue{"slug"},
			$dictStringFieldToValue{"content"},
			$dictBoolFieldToValue{"ispublished"},
			$dictBoolFieldToValue{"isdeleted"},
			$dictBoolFieldToValue{"iscommentsenabled"},
			$dictStringFieldToValue{"pubDate"},
			$dictStringFieldToValue{"lastModified"}
			);
		$postID = $dbh-≻{mysql_insertid};
		
		# Remember this slug.
		# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-4.aspx
		$dictSlugToID{$dictStringFieldToValue{"slug"}} = $postID;
		
		print "-----------------------------
Post ID was: " . $postID . "
-----------------------------
";
	}

# Decode entities
# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-4.aspx
$dictStringFieldToValue{"content"} = decode_entities($dictStringFieldToValue{"content"}, "200-377");
	
	my $strTagName = "";
	my $strWidth = "";
	my $strHeight = "";
	my $strTarget = "";
	my $bIsLocalURL = 0;
	my $strFileName = "";
	my $strStyle = "";
	my $bIsFileContent = 0;
	my $bIsAlreadyLoadedInAnotherPost = 0;
	my %dictURLToDatabaseID = ();
	print "Process URLs";
	# URL and Image processing
	# Here we parse the post content. Extract URLs and store these in the URL entity.
	# Download images to be stored locally. Find and replace all URLs with URL place holders.
	# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-3.aspx
	# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-1.aspx
	my $iUrlCount = 0;
	my $content = $dictStringFieldToValue{"content"};
	my $contentParsed = "";
	my $contentResult = "";
	while ($content =~ /^(.*?)(≺a.+?≻|≺img.+?/≻)/msi)
	{
		print "~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-";
		print "~-~-~-~-~-~ PROCESS URL ~-~-~-~-~-~-";
		print "~-~-~-~-~-~-~- START -~-~-~-~-~-~-~-";
		
		$iUrlCount++;
		$content = substr($content, length($1 . $2));
		$contentParsed .= $1;
		$contentResult .= $1; # Wait with saving the tag until after it has been replaced with a place holder.
		my $tag = $2;
		print "Tag at offset : " . length($contentParsed =~ s/
/
/r) . "";
		$contentParsed .= $tag;
		$bIsAlreadyLoadedInAnotherPost = 0;
		
		# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-3.aspx
		# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-4.aspx
		$bIsLocalURL = $tag =~ /(malmgren.nl|blogspot.com|googleusercontent.com|/image)/i;
		$bIsFileContent = $tag =~ /.(pdf|jpg|gif|png)/i;

		if ($tag =~ /^≺([a-z]+)/i)
		{
			$strTagName = lc($1);
		}
		
		if ($tag =~ /widths*=s*"(.+?)"/i)
		{
			$strWidth = $1;
		}
		else
		{
			$strWidth = "";
		}
		
		if ($tag =~ /heights*=s*"(.+?)"/i)
		{
			$strHeight = $1;
		}
		else
		{
			$strWidth = "";
		}
			
		if ($tag =~ /styles*=s*"(.*?)"/i)
		{
			$strStyle = $1;
		}
		else
		{
			$strStyle = "";
		}
		
		# Replace an URL with an URL place holder. Store the URL in dictURLToDatabaseID so that
		# later on it is possible to search for URLs a second time.
		# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-3.aspx
		# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-3.aspx
		my $iDatabaseIDofThisURLentry = -1;
		if ($tag =~ /(.*?)(https?://|/image)(.+?)["']/i)
		{
			my $strParsedURL = $2.$3;
			$strTarget = uri_decode($strParsedURL);
			$strTarget =~ s//([iI])mage/http://www.jens.malmgren.nl/$1mage/;
					
			if (exists $dictTargetAndTagNameToURLID{$strTarget . "|" . $strTagName})
			{
				print "Already present: " . $strTarget . "|" . $strTagName. "";
				# URL record already available for this URL
				$iDatabaseIDofThisURLentry = $dictTargetAndTagNameToURLID{$strTarget . "|" . $strTagName};
				$bIsAlreadyLoadedInAnotherPost = 1;
			}
			else
			{
				# Insert the URL record. Get the ID of it for replacing URLs with place holders such as /media/PICT3080_s1600.JPG
				# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-5.aspx
				$dbh-≻do(
					'INSERT INTO URL (TagName,    Width,    Height,    IsLocalURL,  Style,    IsFileContent  ) ' .
					'VALUES 		 (?,  	      ?,        ?,         ?,           ?,        ?              )' , undef,
									  $strTagName,$strWidth,$strHeight,$bIsLocalURL,$strStyle,$bIsFileContent);
				$dictTargetAndTagNameToURLID{$strTarget . "|" . $strTagName} = $iDatabaseIDofThisURLentry = $dbh-≻{mysql_insertid};
			}

			$dictURLToDatabaseID{$strTarget} = $iDatabaseIDofThisURLentry;
			$tag =~ s/^(.*?)($strParsedURL)(.*?)$/$1{URL:$iDatabaseIDofThisURLentry}$3/;
		}
		
		# Change the image filename to indicate the resolution of the image.
		# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-4.aspx
		$strFileName = $strTarget;
		if ($strFileName =~ s/(.*?/)(s[0-9]+)(/)([a-z0-9]+)(.[a-z]{3,4})$/$4_$2$5/i)
		{
			print "Transformed filename: " . $strFileName . "";
		}
		else
		{
			$strFileName = $strTarget;
			$strFileName =~ s/(.*?/)([^/]+$)/$2/;
		}	

		# Is the URL pointing to a file to be downloaded?
		# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-6.aspx
		# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-4.aspx
		my $strFullPathAndImageFileName = "/usr/local/bin/images/" . $strFileName;
		if ($bIsLocalURL && $bIsFileContent && $downloadImages && !$bIsAlreadyLoadedInAnotherPost && !-f $strFullPathAndImageFileName)
		{
			print "Get target " . $strTarget . " save to " . $strFileName . "";
			my $data = LWP::Simple::get($strTarget);
			open (FH, "≻$strFullPathAndImageFileName");
			binmode (FH);
			print FH $data;
			close (FH);
		}

		$contentResult .= $tag; # Tag is processed. Here we add it.
		
		# Update the FileName and Target of the current URL,
		# Connect the Post with this URL by creating the PostURL record.
		# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-5.aspx
		if (!$bIsAlreadyLoadedInAnotherPost)
		{
			$dbh-≻do('UPDATE URL SET Target = ?, Filename = ? WHERE ID = ?' , undef,
					 $strTarget, $strFileName , $iDatabaseIDofThisURLentry);
		}
		
		# Ignore duplicates of links between a Post and URL because that indicated that the same URL is used twize in a post.
		# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-3.aspx
		eval {
			$dbh-≻do(
				'INSERT INTO PostURL (URLID,                    PostID  )' .
				'VALUES 		     (?,  	                    ?       )' , undef,
									  $iDatabaseIDofThisURLentry,$postID);
			};
		print "PostURL already available. PostID: " . $postID . ", URLID: " . $iDatabaseIDofThisURLentry . ". Message: $@
" if $@;

		print "iDatabaseIDofThisURLentry " . $iDatabaseIDofThisURLentry. "
";
		print "postID " . $postID . "
";
		print "iUrlCount " . $iUrlCount . "
";
		print "strTagName " . $strTagName . "
";
		print "strWidth " . $strWidth . "
";
		print "strHeight " . $strHeight . "
";
		print "strTarget " . $strTarget . "
";
		print "bIsLocalURL " . $bIsLocalURL . "
";
		print "strStyle " . $strStyle . "
";
		print "strFileName " . $strFileName . "
";
		print "bIsFileContent " . $bIsFileContent . "
";
	}

	print "~-~-~-~-~-~-~-~ END ~-~-~-~-~-~-~-~-
";
	print "~-~-~-~-~-~ PROCESS URL ~-~-~-~-~-~-
";
	print "~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
";

	$contentResult .= $content; # Add any text after the last tag.

	# Post processing the content of the post to find any remaining URLs in the text.
	# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-3.aspx
	foreach my $strParsedURL ( keys %dictURLToDatabaseID )
	{
	  print "URL: $strParsedURL, DatabaseID: $dictURLToDatabaseID{$strParsedURL}
";
	  $contentResult =~ s/$strParsedURL/{URL:$dictURLToDatabaseID{$strParsedURL}}/g;
	}

	# Here all URLs of the content replaced with place holders. Now it can be updated to the database.
	# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-5.aspx
	$dbh-≻do('UPDATE Post SET Content = ? WHERE ID = ?' , undef, $contentResult , $postID);

	# Load categories of a post. Connect the Category entity to this Post through the PostCategory entity.
	# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-restart.aspx
	if ($insertPostCategories)
	{
		for my $node ($xmldoc-≻findnodes("/post/categories/category"))
		{
			print "Category: " . $node-≻textContent() . " " . $dictGUIDToID{$node-≻textContent()} . "
";
			$dbh-≻do('INSERT INTO PostCategory (PostID,CategoryID) VALUES (?, ?)' , undef, $postID, $dictGUIDToID{$node-≻textContent()} );
		}
	}

	# Load tags of a post. Connect the Tags entity to this Post through the PostTag entity.
	# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-tags.aspx
	if ($loadTags)
	{
		for my $node ($xmldoc-≻findnodes("/post/tags/tag"))
		{
			print "Loaded tag: " . $node-≻textContent() . "
";
			my $sth = $dbh -≻ prepare('SELECT ID FROM Tag WHERE Name = ?');
			$sth-≻execute( $node-≻textContent() );
			my $tagID = "-";
			if(my $row = $sth-≻fetchrow_hashref())
			{
				$tagID = $$row{"ID"};
				print "Tag found in database: " . $node-≻textContent() . " with ID: " . $tagID ."
";
			}
			if ($tagID eq "-")
			{
				print "Tag not found in database: " . $node-≻textContent() . "
";
				$dbh-≻do('INSERT INTO Tag (Name) VALUES (?)' , undef, $node-≻textContent() );
				$tagID = $dbh-≻{mysql_insertid};
			}
			$dbh-≻do('INSERT INTO PostTag (PostID,TagID) VALUES (?, ?)' , undef, $postID, $tagID );
		}
	}

	# Load feedback.
	# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-comments.aspx
	if ($loadFeedback)
	{
		for my $node ($xmldoc-≻findnodes("/post/comments/comment"))
		{
			print "Feedback on: " . getField("date", $node) . "
";
			$dbh-≻do(
				'INSERT INTO Feedback (PostID, Date, Author, Email, IP, Content, IsApproved, IsDeleted, IsSpam)' .
				'VALUES 			 (?,  	  ?,    ?,      ?,     ?,  ?,       ?,          ?,         ?     )' , undef,
				$postID,
				getField("date", $node),
				getField("author", $node),
				getField("email", $node),
				getField("ip", $node),
				getField("content", $node),
				getBooleanAttribute("approved", $node),
				getBooleanAttribute("deleted", $node),
				getBooleanAttribute("spam", $node)
			);
		}
	}
} # LoadPost()

# Used to load True/False attributes in feedback
# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-comments.aspx
sub getBooleanAttribute
{
	my ($ip_strName, $node) = @_;
	my $result = 0;
	if ($node-≻getAttribute($ip_strName) eq "True")
	{
		$result = 1;
	}
	return $result;
} # getBooleanAttribute()

# Used to load feedback data
# http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-comments.aspx
sub getField
{
	my ($ip_strName, $node) = @_;
	my @fieldNodeList = $node -≻ findnodes($ip_strName);
	if (scalar @fieldNodeList ≻ 0)
	{
		return $fieldNodeList[0] -≻ textContent();
	}
	else
	{
		return "";
	}
} # getField()

My next project will be to manipulate the webserver.

I was born 1967 in Stockholm, Sweden. I grew up in the small village Vågdalen in north Sweden. 1989 I moved to Umeå to study Computer Science at University of Umeå. 1995 I moved to the Netherlands where I live in Almere not far from Amsterdam.

Here on this site I let you see my creations.

I create, that is my hobby.